In [9]:
import gc
import time
import numpy as np
import pandas as pd
import warnings

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as Tfidf
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

import os
import sys
# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what lets `project_utils` (imported just below)
# resolve when the notebook is run from a sub-folder of the project — confirm.
sys.path.append(os.path.dirname(os.getcwd()))

from project_utils import kd_utils
from importlib import reload

# Raise pandas' display caps so wide frames, long text cells and long listings
# are shown up to these limits instead of pandas' defaults.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)
# NOTE(review): this repeats the identical sklearn.pipeline import made earlier in this cell;
# it is harmless but redundant.
from sklearn.pipeline import make_pipeline, make_union, Pipeline
# Suppress every DeprecationWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [10]:
# Both input files are tab-separated despite the .csv extension.
train = pd.read_csv('../input/train.csv', sep='\t')
test = pd.read_csv('../input/test.csv', sep='\t')

# Turn the epoch-second timestamp columns into datetime columns on both frames.
for frame in (train, test):
    frame['question_dt'] = pd.to_datetime(frame['question_utc'], unit='s')
    frame['answer_dt'] = pd.to_datetime(frame['answer_utc'], unit='s')

In [11]:
# Give `test` an all-NaN `answer_score` column.
# NOTE(review): presumably `answer_score` is the target and is absent from the test file,
# so this aligns the two schemas before the frames are stacked — confirm.
test['answer_score']=np.nan

In [12]:
# Stack train on top of test so every feature below is computed once over both.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the
# supported equivalent. Original row labels are kept (no ignore_index), exactly as
# `append` did, and the later split back into train/test is purely positional.
# The order of the raw columns may differ between pandas versions; the rest of the
# notebook addresses columns by name only.
traintest = pd.concat([train, test])

In [13]:
def splittext(x):
    """Split text into tokens on single spaces after blanking out basic punctuation.

    Each of the characters ``. , : ; # !`` is replaced by a space, then the
    string is split on the single-space separator. Because the split is on
    ``' '`` (not on arbitrary whitespace), consecutive separators yield empty
    strings in the result, and tabs/newlines are not treated as separators.

    :param x: the string to tokenize.
    :return: list of tokens, possibly containing empty strings.
    """
    for mark in '.,:;#!':
        x = x.replace(mark, ' ')
    return x.split(' ')

In [14]:
# Text lengths for question ('q') and answer ('a'): character count and
# number of `splittext` tokens.
for side, text_col in (('q', 'question_text'), ('a', 'answer_text')):
    traintest[side + 'lenchar'] = traintest[text_col].apply(len)
    traintest[side + 'lenword'] = traintest[text_col].apply(lambda text: len(splittext(text)))

# Question-minus-answer length differences.
traintest['difflenchar'] = traintest['qlenchar'].sub(traintest['alenchar'])
traintest['difflenword'] = traintest['qlenword'].sub(traintest['alenword'])

# Question/answer length ratios ...
traintest['divlenchar'] = traintest['qlenchar'].div(traintest['alenchar'])
traintest['divlenword'] = traintest['qlenword'].div(traintest['alenword'])

# ... and the inverse, answer/question.
traintest['idivlenchar'] = traintest['alenchar'].div(traintest['qlenchar'])
traintest['idivlenword'] = traintest['alenword'].div(traintest['qlenword'])

# Integer codes for the subreddit and for the question identifier; each encoder is
# fitted on the stacked train+test values.
traintest['subreddit_le'] = LabelEncoder().fit_transform(traintest['subreddit'])
traintest['qid'] = LabelEncoder().fit_transform(traintest['question_id'])

# traintest['qdt_dow'] = pd.to_datetime(traintest.question_utc, origin='unix', unit='s').dt.dayofweek
# traintest['qdt_hour'] = pd.to_datetime(traintest.question_utc, origin='unix', unit='s').dt.hour

# traintest['adt_dow'] = pd.to_datetime(traintest.answer_utc, origin='unix', unit='s').dt.dayofweek
# traintest['adt_hour'] = pd.to_datetime(traintest.answer_utc, origin='unix', unit='s').dt.hour

# log(1 + score) versions of the two score columns.
traintest['question_score_l1p'] = np.log1p(traintest.question_score)
traintest['answer_score_l1p'] = np.log1p(traintest.answer_score)


def _count_upper_words(text):
    """Count the `splittext` tokens of `text` that are longer than one character and fully upper-case.

    Uses the builtin `sum`: passing a generator to `np.sum` is deprecated in NumPy
    and only works through a fallback to the builtin, so this yields the same
    integer counts without relying on that fallback.
    """
    return sum(word.isupper() for word in splittext(text) if len(word) > 1)


traintest['qboldwords'] = traintest.question_text.apply(_count_upper_words)
traintest['aboldwords'] = traintest.answer_text.apply(_count_upper_words)

In [15]:
# For every row, the number of non-null `id` values that share its `qid`
# (presumably the number of answers recorded for that question — confirm).
# The built-in 'count' aggregation yields the same values as a per-group
# `lambda x: x.count()` without a Python-level call for every group.
traintest['acount'] = traintest.groupby('qid')['id'].transform('count')

# Substrings whose occurrence counts become features. Matching is a plain,
# case-insensitive, non-overlapping substring count (`str.count` on the lower-cased
# text), so a token is also counted when it appears inside a longer word.
EMO = [
    '!',
    ';-)',
    ':-)',
    ':-(',
    'fuck',
    'poop',
    'shit',
    'garbage',
    'crap',
    'dumb',
    'excellent',
    'brilliant',
    'good',
    'bad',
    'poor',
]

# Lower-case each text column once, rather than once per token inside the loop.
question_lower = traintest.question_text.apply(lambda text: text.lower())
answer_lower = traintest.answer_text.apply(lambda text: text.lower())

for i, emo in enumerate(EMO):
    # Feature names are positional: qEMO<i> / aEMO<i> hold the counts of EMO[i]
    # in the question and answer text respectively.
    traintest['qEMO'+str(i)] = question_lower.apply(lambda text: text.count(emo))
    traintest['aEMO'+str(i)] = answer_lower.apply(lambda text: text.count(emo))
    print(i,emo,traintest['qEMO'+str(i)].sum(),traintest['aEMO'+str(i)].sum())

# The lower-cased copies are only needed for the counting above.
del question_lower, answer_lower

0 ! 179762 180502
1 ;-) 190 301
2 :-) 452 685
3 :-( 206 182
4 fuck 107719 69318
5 poop 2323 1404
6 shit 86414 65498
7 garbage 3702 2552
8 crap 8551 5574
9 dumb 10286 7458
10 excellent 2292 1999
11 brilliant 1632 1344
12 good 102258 88697
13 bad 51654 41034
14 poor 15162 11509


In [16]:
# Cut the stacked frame back into its two parts by position: the first
# len(train) rows are the train rows, everything after them is test.
n_train = len(train)
train = traintest.iloc[:n_train]
test = traintest.iloc[n_train:]

In [17]:
# Display the column names of the train slice, raw columns plus the features added above.
train.columns

Index(['answer_dt', 'answer_score', 'answer_text', 'answer_utc', 'id',
       'question_dt', 'question_id', 'question_score', 'question_text',
       'question_utc', 'subreddit', 'qlenchar', 'qlenword', 'alenchar',
       'alenword', 'difflenchar', 'difflenword', 'divlenchar', 'divlenword',
       'idivlenchar', 'idivlenword', 'subreddit_le', 'qid',
       'question_score_l1p', 'answer_score_l1p', 'qboldwords', 'aboldwords',
       'acount', 'qEMO0', 'aEMO0', 'qEMO1', 'aEMO1', 'qEMO2', 'aEMO2', 'qEMO3',
       'aEMO3', 'qEMO4', 'aEMO4', 'qEMO5', 'aEMO5', 'qEMO6', 'aEMO6', 'qEMO7',
       'aEMO7', 'qEMO8', 'aEMO8', 'qEMO9', 'aEMO9', 'qEMO10', 'aEMO10',
       'qEMO11', 'aEMO11', 'qEMO12', 'aEMO12', 'qEMO13', 'aEMO13', 'qEMO14',
       'aEMO14'],
      dtype='object')

In [21]:
# Columns removed before saving: the raw input columns plus the two derived
# columns `qid` and `answer_score_l1p`.
dropcols = [
    'answer_dt', 'answer_score', 'answer_text', 'answer_utc', 'id',
    'question_dt', 'question_id', 'question_score', 'question_text',
    'question_utc', 'subreddit', 'qid', 'answer_score_l1p',
]

# Same column removal applied to both slices.
train_fs3, test_fs3 = (frame.drop(dropcols, axis=1) for frame in (train, test))

In [22]:
# Hand each feature frame to the project's pickle helper, train first, then test.
for pkl_path, feature_frame in (
    ('../feature_data/train_fs3.pkl', train_fs3),
    ('../feature_data/test_fs3.pkl', test_fs3),
):
    kd_utils.pickle_data(pkl_path, feature_frame)