In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from analyzer import Analyzer

In [2]:
analyzer = Analyzer('train')

In [3]:
train = pd.read_csv('./data/prep_train.csv')
val = pd.read_csv('./data/prep_val.csv')

In [4]:
train['sent_POS_text'].isnull().sum(), val['sent_POS_text'].isnull().sum()

(0, 1)

In [5]:
val.dropna(subset=['sent_POS_text'], inplace=True)

In [6]:
vectorizer = CountVectorizer(ngram_range=(1,2))

In [7]:
train['class'] = train['class'].map({'H': 0, 'M': 1})
val['class'] = val['class'].map({'H': 0, 'M': 1})

In [8]:
x_train = train['sent_POS_text'].values
x_val = val['sent_POS_text'].values

y_train = train['class'].values
y_val = val['class'].values

In [9]:
x_train_bow = vectorizer.fit_transform(x_train)
x_val_bow = vectorizer.transform(x_val)

In [9]:
lr = LogisticRegression(solver='liblinear')
svc = SVC()

In [49]:
cross_val_score(lr, x_train_bow, y_train, scoring='f1', cv=3)

array([0.59892826, 0.59500116, 0.59855875])

In [None]:
cross_val_score(svc, x_train_bow, y_train, scoring='f1', cv=3)

In [10]:
lr.fit(x_train_bow, y_train)
y_pred = lr.predict(x_val_bow)

In [11]:
print(classification_report(y_pred, y_val))

              precision    recall  f1-score   support

           0       0.59      0.60      0.59     10635
           1       0.60      0.59      0.60     10875

    accuracy                           0.59     21510
   macro avg       0.59      0.59      0.59     21510
weighted avg       0.59      0.59      0.59     21510



In [None]:
svc.fit(x_train_bow, y_train)
y_pred = svc.predict(x_val_bow)

In [None]:
print(classification_report(y_pred, y_val))

In [36]:
pd.DataFrame(np.c_[np.array(list(vectorizer.vocabulary_)), lr.coef_[0]], columns=['feature', 'coef']).sort_values(by='coef', ascending=False).head(50)

Unnamed: 0,feature,coef
266,prep grnd,1.465232113355292
66,verb adjf,1.4153137617452582
156,advb infn,1.1372719451764477
255,intj adjf,1.0749783837961238
60,adjs advb,0.9996312204827464
47,adjs,0.9824967144245648
74,npro verb,0.8426747653726017
330,comp grnd,0.8184050010522685
254,prtf prts,0.8008223545746783
26,adjf adjf,0.78925348906207


In [19]:
bow_train = pd.DataFrame(x_train_bow.toarray(), columns=vectorizer.vocabulary_.keys())
bow_train['class'] = train['class']

In [24]:
bow_train

Unnamed: 0,noun,adjf,prep,noun noun,noun adjf,adjf prep,prep adjf,adjf noun,none,prcl,...,pred prts,prts intj,intj prts,grnd intj,grnd pred,grnd comp,intj comp,pred numr,numr intj,class
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129061,6,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
129062,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
129063,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
129064,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
