# Predict the Happiness

* [competition url](https://www.hackerearth.com/problem/machine-learning/predict-the-happiness/)
* [fasttext](https://github.com/facebookresearch/fastText)

**Task:** solve sentiment analysis problem (text classification). Classify TripAdvisor revies as "happy" or "not happy"

In [8]:
import re
import pandas as pd
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import (train_test_split,
                                     StratifiedKFold,
                                     GridSearchCV)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.externals.joblib import Memory
from nltk.tokenize import word_tokenize

In [9]:
en_stopwords = stopwords.words('english')

In [10]:
df = pd.read_csv('/home/ubuntu/train.csv')
df.drop(['User_ID'], 1, inplace=True)



df['Is_Response'] = LabelBinarizer().fit_transform(df['Is_Response'].values)
df['Browser_Used'] = LabelEncoder().fit_transform(df['Browser_Used'].values)
df['Device_Used'] = LabelEncoder().fit_transform(df['Device_Used'].values)
df['Description'] = df['Description'].map(lambda x: x.replace('\n', ' '))

# an exta collumn for fasttext
df['y'] = df['Is_Response'].map(lambda x: '__label__{}'.format(x))

train_df, valid_df = train_test_split(df,
                                     test_size=0.2,
                                     stratify=df['Is_Response'].values,
                                     random_state=1234)

In [11]:
train_df.head()

Unnamed: 0,Description,Browser_Used,Device_Used,Is_Response,y
15200,We stayed prior to a cruise. Since we had just...,2,0,0,__label__0
27682,The hotel rooms were under renovation and the ...,2,2,1,__label__1
20617,"Front desk staff were excellent, actively look...",1,1,0,__label__0
28803,Spent a wonderful night at the Amalfi with fri...,6,1,0,__label__0
25801,"The hotel was clean, room was clean, the conti...",1,1,1,__label__1


In [15]:
X_train, y_train = train_df.drop('Is_Response', 1), train_df['Is_Response'].values
X_valid, y_valid = valid_df.drop('Is_Response', 1), valid_df['Is_Response'].values

In [16]:
class ColSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column, reshape=False):
        self.column = column
        self.reshape = reshape
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X =  X[self.column].values
        if self.reshape:
            X = X.reshape(-1, 1)
        return X

In [17]:
VOCAB_SIZE = 15000

def preprocessor(text):
    text = re.sub('[^\w]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text.strip()

browser_pipe = make_pipeline(ColSelector('Browser_Used', True), OneHotEncoder())
device_pipe = make_pipeline(ColSelector('Device_Used', True), OneHotEncoder())
text_pipe = make_pipeline(ColSelector('Description'), 
                          TfidfVectorizer(max_features=VOCAB_SIZE,
                                          stop_words=en_stopwords,
                                          tokenizer=word_tokenize,
                                          min_df=5,
                                          sublinear_tf=True,
                                          preprocessor=preprocessor,
                                          ngram_range=(1, 2)))

In [18]:
feature_extraction_pipeline = make_union(browser_pipe, device_pipe, text_pipe)
feature_extraction_pipeline.fit(X_train, y_train)

FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(memory=None,
     steps=[('colselector', ColSelector(column='Browser_Used', reshape=True)), ('onehotencoder', OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True))])), ('pipelin...   tokenizer=<function word_tokenize at 0x7f6f34a08c80>, use_idf=True,
        vocabulary=None))]))],
       transformer_weights=None)

In [19]:
X_train_features = feature_extraction_pipeline.transform(X_train)
X_valid_features = feature_extraction_pipeline.transform(X_valid)

In [21]:
cv = StratifiedKFold().split(X_train_features, y_train)

param_space = {
    'n_estimators': [10, 100, 200, 1000, 1200, 1500],
    'max_depth': [20, 100, None],
}

gs = GridSearchCV(RandomForestClassifier(), param_space, scoring='f1_weighted', cv=cv, verbose=8, n_jobs=-1)
gs.fit(X_train_features, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] n_estimators=10, max_depth=20 ...................................
[CV] n_estimators=10, max_depth=20 ...................................
[CV] n_estimators=10, max_depth=20 ...................................
[CV] n_estimators=100, max_depth=20 ..................................
[CV]  n_estimators=10, max_depth=20, score=0.7402471750176441, total=   1.7s
[CV] n_estimators=100, max_depth=20 ..................................
[CV]  n_estimators=10, max_depth=20, score=0.7366480244129197, total=   1.8s
[CV] n_estimators=100, max_depth=20 ..................................
[CV]  n_estimators=10, max_depth=20, score=0.7493637874207784, total=   1.9s
[CV] n_estimators=200, max_depth=20 ..................................
[CV]  n_estimators=100, max_depth=20, score=0.7475293344987161, total=  11.5s
[CV] n_estimators=200, max_depth=20 ..................................
[CV]  n_estimators=100, max_depth=20, score=0.7379799315107213

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.8min


[CV]  n_estimators=1000, max_depth=20, score=0.7346614267045284, total= 1.3min
[CV] n_estimators=1200, max_depth=20 .................................
[CV]  n_estimators=1000, max_depth=20, score=0.7446808318257195, total= 1.3min
[CV] n_estimators=1500, max_depth=20 .................................
[CV]  n_estimators=1200, max_depth=20, score=0.736618176466816, total= 1.5min
[CV] n_estimators=1500, max_depth=20 .................................
[CV]  n_estimators=1200, max_depth=20, score=0.7339143880940687, total= 1.5min
[CV] n_estimators=1500, max_depth=20 .................................
[CV]  n_estimators=1200, max_depth=20, score=0.7441656700801085, total= 1.5min
[CV] n_estimators=10, max_depth=100 ..................................
[CV]  n_estimators=10, max_depth=100, score=0.7910049608732277, total=   3.7s
[CV] n_estimators=10, max_depth=100 ..................................
[CV]  n_estimators=10, max_depth=100, score=0.8021557999327927, total=   3.6s
[CV] n_estimators=10, ma

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 20.0min


[CV]  n_estimators=10, max_depth=None, score=0.779777130422683, total=   3.8s
[CV] n_estimators=10, max_depth=None .................................
[CV]  n_estimators=10, max_depth=None, score=0.7824666353903412, total=   3.8s
[CV] n_estimators=10, max_depth=None .................................
[CV]  n_estimators=10, max_depth=None, score=0.7868018987619609, total=   3.9s
[CV] n_estimators=100, max_depth=None ................................
[CV]  n_estimators=100, max_depth=None, score=0.8293775544172906, total=  36.8s
[CV] n_estimators=100, max_depth=None ................................
[CV]  n_estimators=100, max_depth=None, score=0.830196336087254, total=  37.0s
[CV] n_estimators=100, max_depth=None ................................
[CV]  n_estimators=100, max_depth=None, score=0.8372422535441857, total=  37.3s
[CV] n_estimators=200, max_depth=None ................................
[CV]  n_estimators=1500, max_depth=100, score=0.828840721720178, total= 8.8min
[CV] n_estimators=20

[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 47.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 47.5min finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f6f32427308>,
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 100, 200, 1000, 1200, 1500], 'max_depth': [20, 100, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_weighted', verbose=8)

In [22]:
y_pred = gs.predict(X_valid_features)
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

          0       0.84      0.97      0.90      5305
          1       0.92      0.60      0.72      2482

avg / total       0.86      0.85      0.85      7787



# Fasttext

In [24]:
def save_fasttext(df, path):
    with open(path, 'w+') as f:
        for _, row in df.iterrows():
            f.write('{} {}\n'.format(row['y'], row['Description']))
            
            
save_fasttext(train_df, '/tmp/train.csv')
save_fasttext(valid_df, '/tmp/test.csv')

### run training

In [None]:
!./fasttext supervised -input /tmp/train.csv\
    -output /tmp/model\
    -verbose 10\
    -dim 300\
    -retrain\
    -pretrainedVectors cc.hy.300.vec\
    -epoch 10

### build predictions on a test set

In [None]:
!./fasttext predict /tmp/model.bin /tmp/test.csv  > /tmp/pred.csv

### calc metrics

In [25]:
y_pred = pd.read_csv('/tmp/pred.csv', header=None, names=['y'])['y'].values
print(classification_report(valid_df['y'].values, y_pred))

             precision    recall  f1-score   support

 __label__0       0.91      0.92      0.91      5305
 __label__1       0.82      0.79      0.81      2482

avg / total       0.88      0.88      0.88      7787

