#### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from common import tokenize, eval_pred

#### Read Data

In [2]:
# Load the training split and the unlabeled test comments
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Test labels live in a separate file: drop its first column and append the
# remaining columns to the test frame side by side (rows are matched by index)
test_y = pd.read_csv('../data/test_labels.csv')
test_df = pd.concat(
    [test_df, test_y.iloc[:, 1:]],
    axis=1,
    sort=False,
)

# Sanity check on the resulting frame sizes
print(train_df.shape, test_df.shape)

(159571, 8) (153164, 8)


#### Initialization

In [3]:
# Column names used throughout the notebook
comment_col = 'comment_text'
non_toxic_label = 'non_toxic'

# Maps a model name to its evaluation score
scores_tracker = {}

# Every column of the training frame from the third one onward is treated as a class label
class_labels = train_df.columns[2:].tolist()
class_labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
# Replace missing comments with a placeholder string in both splits
train_df[comment_col] = train_df[comment_col].fillna('unknown')
test_df[comment_col] = test_df[comment_col].fillna('unknown')

# Add the complementary class: 1 minus the row-wise maximum over the label
# columns (i.e. 1 only where no label is set, assuming the labels are 0/1)
train_df[non_toxic_label] = 1 - train_df[class_labels].max(axis=1)

#### 1. Baseline Model - Logistic Regression
- With a bag-of-words representation of the text comments (unigram and bigram counts)

##### Functions

In [5]:
def get_pred_lr(train_y, train_x,
                test_x,
                class_labels=class_labels,
                **kwargs):
    '''Fit one logistic regression per label and collect its test-set scores.

    Parameters
    ----------
    train_y : indexable by label name; ``train_y[label]`` is the training
        target for that label.
    train_x : training feature matrix passed to ``LogisticRegression.fit``.
    test_x : test feature matrix; ``test_x.shape[0]`` sets the number of
        output rows.
    class_labels : labels to model, one output column per label, in order.
    **kwargs : forwarded unchanged to the ``LogisticRegression`` constructor.

    Returns
    -------
    numpy array of shape ``(test_x.shape[0], len(class_labels))`` whose
    column ``i`` holds the second ``predict_proba`` column of the model
    fitted for ``class_labels[i]`` (presumably the positive class for 0/1
    targets - confirm against the label encoding).
    '''
    n_rows, n_labels = test_x.shape[0], len(class_labels)
    preds = np.zeros((n_rows, n_labels))

    # A fresh, independent classifier is trained for every label
    for col, label in enumerate(class_labels):
        print('fit', label)
        clf = LogisticRegression(**kwargs).fit(train_x, train_y[label])
        preds[:, col] = clf.predict_proba(test_x)[:, 1]

    return preds

def run_lr_base(vectorizer,
                train_df, test_df,
                comment_col=comment_col,
                class_labels=class_labels,
                **kwargs):
    '''Vectorize the comments, fit per-label logistic regressions and score them.

    Parameters
    ----------
    vectorizer : object exposing ``fit_transform`` and ``transform``; it is
        fitted on the training comments here (the passed object is modified).
    train_df, test_df : frames holding ``comment_col`` and all ``class_labels``.
    comment_col : name of the text column to vectorize.
    class_labels : label columns to predict and evaluate.
    **kwargs : forwarded to ``get_pred_lr`` (and from there to
        ``LogisticRegression``).

    Returns
    -------
    Whatever ``eval_pred`` (imported from ``common``) returns for the test
    targets and predictions; the value is passed through unchanged.
    '''
    # The vectorizer is fitted on the training text only and reused for the test text
    features_train = vectorizer.fit_transform(train_df[comment_col])
    features_test = vectorizer.transform(test_df[comment_col])

    # Targets for fitting and for evaluation
    targets_train = train_df[class_labels]
    targets_test = test_df[class_labels]

    predictions = get_pred_lr(targets_train, features_train,
                              features_test,
                              class_labels,
                              **kwargs)
    return eval_pred(targets_test, predictions, class_labels)

##### RUN - Baseline

In [6]:
# Count-based vectorizer over unigrams and bigrams, using the project tokenizer.
# Terms appearing in fewer than 3 documents or in more than 90% of them are dropped.
cntvec = CountVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    strip_accents='unicode',
    min_df=3,
    max_df=0.9,
)

In [7]:
%%time
# Train the per-label baseline models, evaluate them on the test set,
# and record the score under the 'lr_base' key
score_lr_base = run_lr_base(cntvec, train_df, test_df,
                            comment_col=comment_col, class_labels=class_labels,
                            C=4, dual=False, max_iter=200,
                            random_state=123, n_jobs=-1)
scores_tracker['lr_base'] = score_lr_base

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
Mean ROC-AUC: 0.938047333754659
Wall time: 11min 19s
