In [96]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold

from scipy.stats import mode

In [4]:
# Load the survey spreadsheet; the cells in this notebook read their columns
# from this `data` frame.
data = pd.read_excel('favorites.xlsx')

def absolute_correlations(col, df=None):
    """Rank the numeric columns of ``df`` by |correlation| with ``df[col]``.

    Parameters
    ----------
    col : label
        Column that every numeric column is correlated against.
    df : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``data`` frame is
        looked up at call time, so a re-loaded ``data`` is picked up without
        re-running this definition.

    Returns
    -------
    pandas.DataFrame
        A single ``'correlation'`` column of signed correlations, indexed by
        column name and ordered from largest to smallest absolute value
        (NaN correlations last). The row for ``col`` itself is excluded.
    """
    if df is None:
        df = data

    correlations = df.select_dtypes(include=[np.number]).corrwith(df[col])

    # Drop the self-correlation by label rather than by position: another
    # column that is perfectly (anti-)correlated with `col` ties at |r| == 1
    # and could otherwise be the row that gets discarded.
    correlations = correlations.drop(col, errors='ignore')

    ranked = correlations.to_frame('correlation')
    ranked['absol'] = ranked['correlation'].abs()
    return ranked.sort_values('absol', ascending=False).drop('absol', axis=1)

In [87]:
# Self-reported "intelligence" score: the sum of three survey items. The two
# negatively worded items are flipped as (8 - answer) before being added.
# NOTE(review): the 8 - answer flip presumably assumes a 1-7 response scale -- confirm.
intelligence_scale = (
    data['I have difficulty understanding abstract ideas.'].rsub(8)
    .add(data["I'm not interested in abstract ideas."].rsub(8))
    .add(data["I'm a fast learner."])
)

# Binary target: 1 when the score is strictly above the sample mean, else 0.
# A row with a missing answer has a NaN score, compares False, and so gets 0.
y = (intelligence_scale > intelligence_scale.mean()).astype(int).values

# Things smart people like

In [84]:
# Free-text "things you like" answers. Blank responses become ' ' and every
# entry is cast to str -- as the other free-text cells in this notebook do --
# so a purely numeric answer cannot break the vectorizer's string handling.
x = data['25 things you like'].fillna(' ').astype(str)

In [85]:
# Hyper-parameter search for the TF-IDF -> XGBoost pipeline, scored by ROC AUC.
param_grid = {
    # min_df is held at 3 rather than swept.
    'tfidfvectorizer__min_df': [3],
    # 0.080, 0.085, ..., 0.115. linspace + round yields exactly eight clean
    # values; np.arange with a float step produced artefacts such as
    # 0.10500000000000002 and a length that depends on rounding at the endpoint.
    'tfidfvectorizer__max_df': np.linspace(0.08, 0.115, num=8).round(3),
    'xgbclassifier__n_estimators': [100],
    'xgbclassifier__max_depth': np.arange(2, 7),
}

# Three stratified folds, shared with the later cross_val_score cell.
kf = StratifiedKFold(n_splits=3)

pl = make_pipeline(TfidfVectorizer(), XGBClassifier())

grid = GridSearchCV(pl, param_grid, cv=kf, scoring='roc_auc')
grid.fit(x, y)

# Pipeline carrying the best-scoring parameter combination.
clf = grid.best_estimator_

clf

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.10500000000000002, max_features=None,
        min_df=3, ngram_range=(1, 1), norm='l2', preprocesso...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])

In [88]:
# Interests alone predict the intelligence label reasonably well:
# ROC AUC of the selected pipeline over the three stratified folds.
# NOTE(review): clf's parameters were picked by the grid search on this same
# x, y and kf, so this estimate is optimistic.
cv = cross_val_score(estimator=clf, X=x, y=y, scoring='roc_auc', cv=kf)

(cv.mean(), cv.std())

(0.7959799861973775, 0.016360165681333392)

In [97]:
# Score every vocabulary word as a one-word document to see which words, on
# their own, raise the predicted probability of class 1 (above average).
vocabulary = list(clf.named_steps['tfidfvectorizer'].vocabulary_)

# One batched call instead of one predict_proba call per word; column 1 is
# the probability of class 1.
probabilities = clf.predict_proba(vocabulary)[:, 1]

# word -> probability mapping, kept under its original name.
words = dict(zip(vocabulary, probabilities))

# float64 matches the precision shown in the notebook's recorded output.
df_topwords = (
    pd.Series(probabilities, index=vocabulary, name='probability')
    .astype('float64')
    .sort_values(ascending=False)
    .to_frame()
)

In [99]:
# The most frequent probability is treated as the model's default prediction.
# Series.mode() returns the modal value(s) in ascending order, so .iloc[0] is
# the smallest mode -- the same tie-break scipy.stats.mode applies -- and it
# avoids indexing scipy's result with [0][0], which fails on SciPy versions
# that return scalars for 1-D input.
df_mode = df_topwords['probability'].mode().iloc[0]

print()
print('Default prediction for this model is:', df_mode)
print()

# Show only the words whose score differs from that default.
df_topwords[df_topwords.probability != df_mode]


Default prediction for this model is: 0.014619269408285618



Unnamed: 0,probability
harry,0.047004
comedy,0.039199
others,0.035943
show,0.031275
shows,0.029064
helping,0.021652
green,0.018953
up,0.017309
conversations,0.017136
dad,0.017021


# I am ______

Students were asked to fill in the blank 10 times, saying whatever came to mind.

In [109]:
# Free-text answers to the "I am ___" prompt: blanks become ' ' and every
# entry is cast to str before vectorizing.
x = (
    data['I am _______.']
    .fillna(' ')
    .astype(str)
)

In [110]:
# Hyper-parameter search for the TF-IDF -> XGBoost pipeline, scored by ROC AUC.
param_grid = {
    # min_df is held at 3 rather than swept.
    'tfidfvectorizer__min_df': [3],
    # 0.080, 0.085, ..., 0.115. linspace + round yields exactly eight clean
    # values; np.arange with a float step produces inexact grid values and a
    # length that depends on rounding at the endpoint.
    'tfidfvectorizer__max_df': np.linspace(0.08, 0.115, num=8).round(3),
    'xgbclassifier__n_estimators': [100],
    'xgbclassifier__max_depth': np.arange(2, 7),
}

# Three stratified folds, shared with the later cross_val_score cell.
kf = StratifiedKFold(n_splits=3)

pl = make_pipeline(TfidfVectorizer(), XGBClassifier())

grid = GridSearchCV(pl, param_grid, cv=kf, scoring='roc_auc')
grid.fit(x, y)

# Pipeline carrying the best-scoring parameter combination.
clf = grid.best_estimator_

clf

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.08, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])

In [111]:
# This model works well too: ROC AUC of the selected pipeline over the three
# stratified folds.
# NOTE(review): clf's parameters were picked by the grid search on this same
# x, y and kf, so this estimate is optimistic.
cv = cross_val_score(estimator=clf, X=x, y=y, scoring='roc_auc', cv=kf)

(cv.mean(), cv.std())

(0.7602010006901313, 0.00788388208725665)

In [112]:
# Score every vocabulary word as a one-word document to see which words, on
# their own, raise the predicted probability of class 1 (above average).
vocabulary = list(clf.named_steps['tfidfvectorizer'].vocabulary_)

# One batched call instead of one predict_proba call per word; column 1 is
# the probability of class 1.
probabilities = clf.predict_proba(vocabulary)[:, 1]

# word -> probability mapping, kept under its original name.
words = dict(zip(vocabulary, probabilities))

# float64 matches the precision shown in the notebook's recorded output.
df_topwords = (
    pd.Series(probabilities, index=vocabulary, name='probability')
    .astype('float64')
    .sort_values(ascending=False)
    .to_frame()
)

In [113]:
# The most frequent probability is treated as the model's default prediction.
# Series.mode() returns the modal value(s) in ascending order, so .iloc[0] is
# the smallest mode -- the same tie-break scipy.stats.mode applies -- and it
# avoids indexing scipy's result with [0][0], which fails on SciPy versions
# that return scalars for 1-D input.
df_mode = df_topwords['probability'].mode().iloc[0]

print()
print('Default prediction for this model is:', df_mode)
print()

# Show only the words whose score differs from that default.
df_topwords[df_topwords.probability != df_mode]


Default prediction for this model is: 0.007261719089001417



Unnamed: 0,probability
patient,0.081004
over,0.076149
woman,0.073762
hungry,0.033324
determined,0.022406
focused,0.011084
helpful,0.010576
11,0.010411
going,0.009664
being,0.009604


# How would you introduce yourself to a classmate?

I told them to write this in a way that emphasized their personality. Swearing and slang were acceptable.

In [115]:
# Free-text self-introductions: blanks become ' ' and every entry is cast to
# str before vectorizing.
x = (
    data['Your "introduction"']
    .fillna(' ')
    .astype(str)
)

In [116]:
# Hyper-parameter search for the TF-IDF -> XGBoost pipeline, scored by ROC AUC.
param_grid = {
    # min_df is held at 3 rather than swept.
    'tfidfvectorizer__min_df': [3],
    # 0.080, 0.085, ..., 0.115. linspace + round yields exactly eight clean
    # values; np.arange with a float step produces inexact grid values and a
    # length that depends on rounding at the endpoint.
    'tfidfvectorizer__max_df': np.linspace(0.08, 0.115, num=8).round(3),
    'xgbclassifier__n_estimators': [100],
    'xgbclassifier__max_depth': np.arange(2, 7),
}

# Three stratified folds, shared with the later cross_val_score cell.
kf = StratifiedKFold(n_splits=3)

pl = make_pipeline(TfidfVectorizer(), XGBClassifier())

grid = GridSearchCV(pl, param_grid, cv=kf, scoring='roc_auc')
grid.fit(x, y)

# Pipeline carrying the best-scoring parameter combination.
clf = grid.best_estimator_

clf

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.085, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])

In [117]:
# This model works well too: ROC AUC of the selected pipeline over the three
# stratified folds.
# NOTE(review): clf's parameters were picked by the grid search on this same
# x, y and kf, so this estimate is optimistic.
cv = cross_val_score(estimator=clf, X=x, y=y, scoring='roc_auc', cv=kf)

(cv.mean(), cv.std())

(0.7533643892339544, 0.06730802105852025)

In [118]:
# Score every vocabulary word as a one-word document to see which words, on
# their own, raise the predicted probability of class 1 (above average).
vocabulary = list(clf.named_steps['tfidfvectorizer'].vocabulary_)

# One batched call instead of one predict_proba call per word; column 1 is
# the probability of class 1.
probabilities = clf.predict_proba(vocabulary)[:, 1]

# word -> probability mapping, kept under its original name.
words = dict(zip(vocabulary, probabilities))

# float64 matches the precision shown in the notebook's recorded output.
df_topwords = (
    pd.Series(probabilities, index=vocabulary, name='probability')
    .astype('float64')
    .sort_values(ascending=False)
    .to_frame()
)

In [119]:
# The most frequent probability is treated as the model's default prediction.
# Series.mode() returns the modal value(s) in ascending order, so .iloc[0] is
# the smallest mode -- the same tie-break scipy.stats.mode applies -- and it
# avoids indexing scipy's result with [0][0], which fails on SciPy versions
# that return scalars for 1-D input.
df_mode = df_topwords['probability'].mode().iloc[0]

print()
print('Default prediction for this model is:', df_mode)
print()

# Show only the words whose score differs from that default.
df_topwords[df_topwords.probability != df_mode]


Default prediction for this model is: 0.014791909605264664



Unnamed: 0,probability
appreciate,0.205342
big,0.103176
re,0.052137
thought,0.044312
spent,0.033475
world,0.028655
having,0.028361
never,0.025057
after,0.023248
listening,0.022401


**Movies! There it is making an appearance.**

# What you don't like

In [122]:
# Free-text "things you DON'T like" answers: blanks become ' ' and every
# entry is cast to str before vectorizing.
x = (
    data['10 things you DON\'T like']
    .fillna(' ')
    .astype(str)
)

In [123]:
# Hyper-parameter search for the TF-IDF -> XGBoost pipeline, scored by ROC AUC.
param_grid = {
    # min_df is held at 3 rather than swept.
    'tfidfvectorizer__min_df': [3],
    # 0.080, 0.085, ..., 0.115. linspace + round yields exactly eight clean
    # values; np.arange with a float step produced artefacts such as
    # 0.10000000000000002 and a length that depends on rounding at the endpoint.
    'tfidfvectorizer__max_df': np.linspace(0.08, 0.115, num=8).round(3),
    'xgbclassifier__n_estimators': [100],
    'xgbclassifier__max_depth': np.arange(2, 7),
}

# Three stratified folds, shared with the later cross_val_score cell.
kf = StratifiedKFold(n_splits=3)

pl = make_pipeline(TfidfVectorizer(), XGBClassifier())

grid = GridSearchCV(pl, param_grid, cv=kf, scoring='roc_auc')
grid.fit(x, y)

# Pipeline carrying the best-scoring parameter combination.
clf = grid.best_estimator_

clf

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.10000000000000002, max_features=None,
        min_df=3, ngram_range=(1, 1), norm='l2', preprocesso...=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1))])

In [124]:
# This model works really well but has high variance across the folds.
# NOTE(review): clf's parameters were picked by the grid search on this same
# x, y and kf, so this estimate is optimistic.
cv = cross_val_score(estimator=clf, X=x, y=y, scoring='roc_auc', cv=kf)

(cv.mean(), cv.std())

(0.7837301587301587, 0.11179491851774788)

In [125]:
# Score every vocabulary word as a one-word document to see which words, on
# their own, raise the predicted probability of class 1 (above average).
vocabulary = list(clf.named_steps['tfidfvectorizer'].vocabulary_)

# One batched call instead of one predict_proba call per word; column 1 is
# the probability of class 1.
probabilities = clf.predict_proba(vocabulary)[:, 1]

# word -> probability mapping, kept under its original name.
words = dict(zip(vocabulary, probabilities))

# float64 matches the precision shown in the notebook's recorded output.
df_topwords = (
    pd.Series(probabilities, index=vocabulary, name='probability')
    .astype('float64')
    .sort_values(ascending=False)
    .to_frame()
)

In [126]:
# The most frequent probability is treated as the model's default prediction.
# Series.mode() returns the modal value(s) in ascending order, so .iloc[0] is
# the smallest mode -- the same tie-break scipy.stats.mode applies -- and it
# avoids indexing scipy's result with [0][0], which fails on SciPy versions
# that return scalars for 1-D input.
df_mode = df_topwords['probability'].mode().iloc[0]

print()
print('Default prediction for this model is:', df_mode)
print()

# Show only the words whose score differs from that default.
df_topwords[df_topwords.probability != df_mode]


Default prediction for this model is: 0.008510957472026348



Unnamed: 0,probability
late,0.363562
bad,0.181727
math,0.142716
when,0.044622
crowds,0.04311
their,0.033495
with,0.024671
running,0.020442
movies,0.013429
food,0.013396


In [128]:
# Maybe smart people specifically dislike *bad movies*: score the two-word
# phrase as one document and read off the class-1 probability.
clf.predict_proba(['bad movies'])[0, 1]

0.20233338

It also looks like the below-average respondents emphasized more sensory, tangible things they don't like (spiders, hot ____, cold ____). Maybe this is because my "intelligence" questions are really measuring openness to experience, which is a measure of idea fluency.