In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold


In [4]:
data = pd.read_excel('favorites.xlsx')

def absolute_correlations(col, df=data):
    #absolute_values = np.abs(df[col])
    corrs = pd.DataFrame(df.select_dtypes(include=[np.number]).corrwith(df[col]), columns=['correlation'])
    corrs['absol'] = np.abs(corrs['correlation'])
    return corrs.sort_values('absol', ascending=False).drop('absol', axis=1).tail(len(corrs)-1)

In [10]:
intelligence_scale = (
    (8-data['I have difficulty understanding abstract ideas.']) +
    (8-data["I'm not interested in abstract ideas."]) +
    data["I'm a fast learner."]
)

intelligence = np.where(intelligence_scale > intelligence_scale.mean(), 1, 0)

In [72]:
dont_like = data['10 things you DON\'T like']
x = (data['25 things you like'].fillna(' ').map(str) + ' ' + 
        data['Your "introduction"'].fillna(' ')
)

In [78]:
param_grid = {
    'tfidfvectorizer__min_df': [3],#np.arange(1,11,1),
    'tfidfvectorizer__max_df': np.arange(.08, .12, .005),
    'xgbclassifier__n_estimators': [100],
    'xgbclassifier__max_depth': np.arange(2,7,1)
}

kf = StratifiedKFold(n_splits=3)

pl = make_pipeline(
    TfidfVectorizer(),
    XGBClassifier()
)

grid = GridSearchCV(pl, param_grid, cv=kf, scoring='roc_auc')\
.fit(x, intelligence)

clf = grid.best_estimator_

clf

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.085, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])

In [79]:
clf.named_steps

{'tfidfvectorizer': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.085, max_features=None, min_df=3,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 'xgbclassifier': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
        subsample=1)}

In [80]:
cv = cross_val_score(clf, x, intelligence, cv=kf, scoring='roc_auc')
cv.mean(), cv.std()

(0.711978088336784, 0.07348940223023034)

In [81]:
words = {}

for word, num in clf.named_steps['tfidfvectorizer'].vocabulary_.items():
    prob = clf.predict_proba([word])[0][1]
    words[word] = prob

df_topwords = pd.DataFrame([words]).transpose()
df_topwords.columns = ['probability']

df_topwords.sort_values('probability', ascending=False, inplace=True)

In [82]:
df_topwords

Unnamed: 0,probability
ocean,0.088790
appreciate,0.030618
full,0.021123
last,0.016759
spent,0.013254
comedy,0.011819
helping,0.011545
re,0.008046
24,0.007850
green,0.007792
