In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [2]:
stop_words = set(stopwords.words('english'))

train = pd.read_csv('train.csv', index_col=0)
# test = pd.read_csv('test.csv')
# test_labels = pd.read_csv('test_labels.csv')

X = train.iloc[:, 0]
y = train.iloc[:, 1:]

In [3]:
NB_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('mnb', OneVsRestClassifier(BernoulliNB()))
    ])

SVM_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('clf', OneVsRestClassifier(LinearSVC()))
    ])

LR_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

RF_pl = Pipeline([
    ('vectorizer', CountVectorizer(binary=True)),
    ('clf', OneVsRestClassifier(RandomForestClassifier()))
    ])

In [4]:
kf = KFold(n_splits=5, shuffle=True)

scores_NB = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values

    NB_pl.fit(X_train, y_train)
    y_pred = NB_pl.predict_proba(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_NB.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_NB)) )

0.9108650922921551
0.9062581207293148
0.9092464197534357
0.9102886713017654
0.9073835532241544
MEAN: 0.908808371460165


In [36]:
kf = KFold(n_splits=5, shuffle=True)

scores_SVC = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values
    
    SVM_pl.fit(X_train, y_train)
    y_pred = SVC_pl.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_SVC.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_SVC)) )

0.7266420946935822
0.7385436119457728
0.725363603271547
0.7334646453111482
0.719995488907276
MEAN: 0.7288018888258653


In [37]:
"""
kf = KFold(n_splits=5, shuffle=True)

scores_LR = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values
    
    LR_pl.fit(X_train, y_train)
    y_pred = LR_pl.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_LR.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_LR)) )
"""

"\nkf = KFold(n_splits=5, shuffle=True)\n\nscores_LR = []\n\nfor train_idx, test_idx in kf.split(X):\n    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values\n    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values\n    \n    LR_pl.fit(X_train, y_train)\n    y_pred = LR_pl.predict(X_test)\n    score = roc_auc_score(y_test, y_pred)\n    scores_LR.append(score)\n    print(score)\n    \nprint('MEAN: {}'.format(np.mean(scores_LR)) )\n"

In [38]:
kf = KFold(n_splits=5, shuffle=True)

scores_RF = []

for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx].values, y.iloc[train_idx, :].values
    X_test, y_test = X.iloc[test_idx].values, y.iloc[test_idx, :].values
    
    RF_pl.fit(X_train, y_train)
    y_pred = SVC_pl.predict(X_test)
    score = roc_auc_score(y_test, y_pred)
    scores_RF.append(score)
    print(score)
    
print('MEAN: {}'.format(np.mean(scores_RF)) )

0.876977552657587
0.882698106051695
0.875970105689273
0.8745050437204466
0.8725853222590959
MEAN: 0.8765472260756194
