In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier, XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the two CSV files from the working directory, then stack them row-wise.
# pd.concat keeps each frame's own row index here (no ignore_index), so index
# values repeat in `combi`.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
combi = pd.concat(objs=[train, test])

In [3]:
# Collapse the six binary flag columns of `train` into one class label per row.
#
# A row may have several flags set. np.select returns the choice for the FIRST
# condition that matches, so the columns are listed from highest to lowest
# priority: identity_hate > insult > threat > obscene > severe_toxic > toxic.
# Rows with no flag equal to 1 get the explicit default label 'ok'.
#
# Building the labels in a single np.select call avoids seeding the array with
# integer zeros and later matching the coerced string '0', which silently
# breaks if the flag columns are not stored as integers.
label_priority = [
    'identity_hate',
    'insult',
    'threat',
    'obscene',
    'severe_toxic',
    'toxic',
]

y = np.select(
    [train[col].to_numpy() == 1 for col in label_priority],
    label_priority,
    default='ok',
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Baseline model: TF-IDF text features feeding a multinomial Naive Bayes
# classifier, tuned over the vectorizer settings with a 3-fold grid search.
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', MultinomialNB())
])

x = train['comment_text']

# Only the vectorizer is tuned; the classifier keeps its default settings.
params = [
    {
        # Candidates 0.02, 0.04, ..., 0.20 (10 values). They are built from an
        # integer range and scaled afterwards, so both the number of candidates
        # and their values are exact; np.arange with a float step accumulates
        # rounding error (e.g. 0.12000000000000001).
        'tfidf__max_df': np.arange(2, 21, 2) / 100.0,
        'tfidf__min_df': [2,3],
        'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
        'tfidf__norm': ['l2'],
    },
]

grid = GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=params, scoring='accuracy')
grid.fit(x, y)

# best_estimator_ is the pipeline refit on all of (x, y) with the winning
# parameter combination.
model = grid.best_estimator_
print(model)

# Re-score the selected configuration with an independent 4-fold split.
# NOTE(review): the configuration was chosen on this same data, so this
# accuracy is an optimistic estimate rather than a true hold-out score.
cv = cross_val_score(model, x, y, cv=4, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Second candidate model: TF-IDF text features feeding a logistic regression
# classifier, tuned over the vectorizer settings with a 3-fold grid search.
# NOTE(review): LogisticRegression runs with its default settings; if the fit
# emits convergence warnings, consider raising max_iter.
pl = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classify', LogisticRegression())
])

x = train['comment_text']

# Only the vectorizer is tuned; the classifier keeps its default settings.
params = [
    {
        # Candidates 0.01, 0.02, ..., 0.09 (9 values). They are built from an
        # integer range and scaled afterwards, so both the number of candidates
        # and their values are exact; np.arange with a float step makes the
        # element count depend on floating-point rounding.
        'tfidf__max_df': np.arange(1, 10) / 100.0,
        'tfidf__min_df': [2,3,4],
        'tfidf__ngram_range': [(1,2)],
        'tfidf__norm': ['l2'],
    },
]

grid = GridSearchCV(pl, cv=3, n_jobs=-1, param_grid=params, scoring='accuracy')
grid.fit(x, y)

# best_estimator_ is the pipeline refit on all of (x, y) with the winning
# parameter combination.
model = grid.best_estimator_
print(model)

# Re-score the selected configuration with an independent 4-fold split.
# NOTE(review): the configuration was chosen on this same data, so this
# accuracy is an optimistic estimate rather than a true hold-out score.
cv = cross_val_score(model, x, y, cv=4, scoring='accuracy')

print('Mean score:', cv.mean())
print('Std Dev:   ', cv.std())