In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
import pandas as pd
import numpy as np

In [None]:
# Read the training tweets, the test tweets and the test ground-truth labels,
# in that order, from CSV files in the working directory.
train, test, t_label = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'test_label.csv')
)

In [None]:
# Ground-truth test labels as a NumPy array; the evaluation cell scores predictions against it.
a_test = np.array(t_label)
a_test

In [None]:
# Length of every training tweet, followed by its mean, standard deviation and maximum.
# This runs before missing tweets are filled in, so any missing tweet yields a NaN length here.
length = train.Tweets.str.len()
length.mean(), length.std(), length.max()

In [None]:
# Histogram of tweet lengths; the trailing semicolon suppresses the returned object's repr.
length.hist();

In [None]:
# Label columns, one per personality trait.
# NOTE(review): 'Neurotocism' is presumably spelled exactly as in the CSV header — confirm before "fixing" it.
trait_cols = [ 'Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neurotocism']
# 'none' = 1 - (row-wise max over the trait columns); assumes traits are 0/1 indicators,
# in which case it flags rows with no trait set — TODO confirm.
train['none'] = 1-train[trait_cols].max(axis=1)
train.describe()

In [None]:
# Number of rows in the training and test sets.
len(train),len(test)

In [None]:
# Name of the text column shared by the train and test frames.
TWEET = 'Tweets'
# Replace missing tweets with a placeholder token so the vectorizer only sees strings.
# The filled column is assigned back instead of calling fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that in-place call operates on a
# temporary and leaves the parent DataFrame unchanged (pandas 2.x warns about it).
train[TWEET] = train[TWEET].fillna("unknown")
test[TWEET] = test[TWEET].fillna("unknown")

In [None]:
# Number of training rows.
n = train.shape[0]
# TF-IDF over unigrams and bigrams. Terms appearing in fewer than 3 documents or in
# more than 90% of documents are dropped; accents are stripped; term frequency is
# sub-linearly scaled. The boolean options are passed as real booleans because
# recent scikit-learn releases reject integers for boolean parameters.
vec = TfidfVectorizer(ngram_range=(1,2),
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True )
# Learn the vocabulary and IDF weights on the training tweets only, then reuse them for the test tweets.
train_term_doc = vec.fit_transform(train[TWEET])
test_term_doc = vec.transform(test[TWEET])

In [None]:
# Display both TF-IDF document-term matrices.
train_term_doc, test_term_doc

In [None]:
def pr(y_i, y, x=None):
    """Smoothed per-feature sum of the rows whose label equals ``y_i``.

    Computes ``(column sums of x over rows with y == y_i, plus 1)`` divided by
    ``(number of such rows, plus 1)``. The ``+1`` terms smooth the ratio so it
    is defined even when no row carries the label.

    Parameters
    ----------
    y_i : scalar
        Label value to select (the notebook calls this with 0 and 1).
    y : array-like
        One label per row of ``x``; must support element-wise ``==``.
    x : matrix-like, optional
        Feature matrix to aggregate. Defaults to the notebook-level
        ``train_x``, which must be defined before the first call that
        omits this argument.

    Returns
    -------
    The smoothed per-feature values, in whatever array/matrix type
    ``x[mask].sum(0)`` produces for ``x``.
    """
    if x is None:
        # Backward-compatible fallback to the global feature matrix.
        x = train_x
    mask = y == y_i
    p = x[mask].sum(0)
    return (p + 1) / (mask.sum() + 1)

In [None]:
# Short aliases for the TF-IDF matrices; pr() and get_model() read train_x as a global.
train_x, test_x = train_term_doc, test_term_doc

In [None]:
def get_model(y):
    """Fit a 5-nearest-neighbour classifier for one label column.

    The global ``train_x`` features are first scaled, feature by feature, by the
    log of the ratio ``pr(1, y) / pr(0, y)``, and the classifier is trained on
    the scaled matrix.

    Parameters
    ----------
    y : object exposing ``.values``
        Label column for the training rows (the notebook passes ``train[trait]``).

    Returns
    -------
    tuple
        ``(fitted classifier, log-ratio vector)``. The same log-ratio must be
        applied to any matrix passed to the classifier for prediction.
    """
    labels = y.values
    # Per-feature log ratio of the smoothed class-1 and class-0 feature values.
    log_ratio = np.log(pr(1, labels) / pr(0, labels))
    weighted_x = train_x.multiply(log_ratio)
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(weighted_x, labels)
    return classifier, log_ratio

In [None]:
# One row per test tweet, one column per trait; filled in column by column below.
predicts = np.zeros(shape=(len(test), len(trait_cols)))

for i, t in enumerate(trait_cols):
    print('fit', t)
    # Train an independent model for this trait, then score the test set with
    # the same per-feature weighting that was used for training.
    m, r = get_model(train[t])
    weighted_test_x = test_x.multiply(r)
    trait_proba = m.predict_proba(weighted_test_x)
    # Keep the second probability column as the score for this trait.
    predicts[:, i] = trait_proba[:, 1]

In [15]:
# Round every predicted probability to the nearest integer to get hard labels.
preds = np.rint(predicts)

# (label, scorer) pairs; each scorer is evaluated and printed in listed order.
metric_report = [
    ("Hamming_loss:\t", lambda: metrics.hamming_loss(a_test, preds)),
    ("Precision:\t", lambda: metrics.precision_score(a_test, preds, average='macro')),
    ("Recall:\t", lambda: metrics.recall_score(a_test, preds, average='micro')),
    ("F1:\t", lambda: metrics.f1_score(a_test, preds, average='weighted')),
    ("F_beta:\t", lambda: metrics.fbeta_score(a_test, preds, average='macro', beta=0.5)),
]
for metric_label, compute_metric in metric_report:
    print(metric_label, compute_metric())

# Fraction of test rows scored as correct by accuracy_score, reported as a percentage.
accuracy = metrics.accuracy_score(a_test, preds)
print("The accuracy is " + str(accuracy *100) + "%")

Hamming_loss:	 0.009194618389236778
Precision:	 0.9786787513769537
Recall:	 0.9790703409160965
F1:	 0.9776214863602882
F_beta:	 0.9779264881044292
The accuracy is 96.89069378138755%


In [None]:
# Write the per-trait predicted probabilities to disk, one column per trait, without the index.
result = pd.DataFrame(data=predicts, columns=trait_cols)
result.to_csv('result.csv', index=False)