In [1]:
import os
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
PATH = '../../data/'

# Read the two cleaned splits from the shared data directory.
train, test = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

# Cleaned comment text of each split, plus one combined series
# (train rows first, then test rows).
train_sentence, test_sentence = (
    train['comment_text_cleaned'],
    test['comment_text_cleaned'],
)
text = pd.concat([train_sentence, test_sentence])

# (rows, columns) of each split, one per line.
print(train.shape, test.shape, sep='\n')

(159571, 27)
(153164, 21)


In [3]:
# Default constructor signatures, kept here for reference only:
#
# CountVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,\
#                 token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), analyzer='word', max_df=1.0,\
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)

# TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None,\
#                 token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,\
#                 vocabulary=None, binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True,\
#                 smooth_idf=True, sublinear_tf=False)

print('getting tfidf')

# Word-level TF-IDF: single tokens only, at most 10,000 features.
phrase_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000,
    strip_accents='unicode',
    sublinear_tf=True,
)

# Character-level TF-IDF over 1- to 5-character n-grams, at most 20,000 features.
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
)

# Both vectorizers are fitted on `text` (built in an earlier cell),
# then applied to the train and test sentences separately.
print('fitting char')
char_vectorizer.fit(text.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)

print('transforming test char')
test_char = char_vectorizer.transform(test_sentence.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

# Column layout of the combined matrices: char features first, then word features.
train_tfidf = hstack([train_char, train_phrase], format='csr')
test_tfidf = hstack([test_char, test_phrase], format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Last expression of the cell: shows the repr of the combined train matrix.
train_tfidf

getting tfidf
fitting char
fitting phrase
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 128155325 stored elements in Compressed Sparse Row format>

In [4]:
def get_logreg_model(label_cols, train_features, train, test_features):
    """Fit one default ``LogisticRegression`` per label and collect probabilities.

    Parameters
    ----------
    label_cols : sequence of str
        Names of the target columns; one independent model is fitted for each.
    train_features : array-like or sparse matrix
        Features the models are fitted on; its row count sizes ``train_preds``.
    train : DataFrame-like
        Indexed by label name; ``train[label].values`` supplies the targets.
    test_features : array-like or sparse matrix
        Features to score; its row count sizes ``preds``.

    Returns
    -------
    (preds, train_preds) : tuple of numpy.ndarray
        ``preds`` has one row per ``test_features`` row and ``train_preds`` one
        row per ``train_features`` row. Column ``i`` holds column 1 of
        ``predict_proba`` for ``label_cols[i]`` (the probability of label 1
        when the targets are 0/1).
    """
    n_labels = len(label_cols)
    # Size the outputs from the matrices that were passed in. The previous
    # version read the notebook-level global `test` here, so the function only
    # worked when that global happened to match `test_features`.
    preds = np.zeros((test_features.shape[0], n_labels))
    train_preds = np.zeros((train_features.shape[0], n_labels))

    for i, label in enumerate(label_cols):
        print('fit', label)
        y = train[label].values
        model = LogisticRegression()
        model.fit(train_features, y)
        preds[:, i] = model.predict_proba(test_features)[:, 1]
        train_preds[:, i] = model.predict_proba(train_features)[:, 1]
        # This is ROC AUC measured on the rows the model was fitted on, not
        # accuracy, so it is an in-sample figure.
        print('train ROC AUC is {}'.format(roc_auc_score(y, train_preds[:, i])))
    return preds, train_preds

def save(model_name, y_test, label_cols, path, is_train=False):
    """Write predictions into a sample CSV layout under ``<path>/<model_name>/``.

    Parameters
    ----------
    model_name : str
        Name of the output sub-directory and base name of the output file.
    y_test : array-like
        Values assigned to ``label_cols``. Despite the name, this also carries
        the training-set predictions when ``is_train`` is true.
    label_cols : list of str
        Columns of the template frame that are overwritten with ``y_test``.
    path : str
        Directory holding ``sample_submission.csv`` / ``sample_train.csv``.
        Works with or without a trailing separator.
    is_train : bool, default False
        If true, use ``sample_train.csv`` as the template and prefix the output
        file name with ``train_``.

    The output directory is created if it does not exist, so a missing folder
    no longer raises after the models have been fitted.
    """
    template_name = 'sample_train.csv' if is_train else 'sample_submission.csv'
    file_name = 'train_' + model_name if is_train else model_name

    submission = pd.read_csv(os.path.join(path, template_name))
    submission[label_cols] = y_test

    out_dir = os.path.join(path, model_name)
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(os.path.join(out_dir, file_name + '.csv'), index=False)
    
# Progress marker: printed once the helper definitions in this cell have run.
print('done')

done


In [5]:
print('predicting')
# Fit one model per label; returns test-set and train-set probabilities.
y_test, y_train = get_logreg_model(
    label_cols=label_cols,
    train_features=train_tfidf,
    train=train,
    test_features=test_tfidf,
)

# ROC AUC of the training-set predictions against the training labels,
# so this is an in-sample figure.
print('total score is {}'.format(roc_auc_score(train[label_cols], y_train)))

print('saving files')
model_name = 'logreg'
# Test predictions use the submission template; train predictions use the
# train template.
save(model_name, y_test, label_cols, PATH, is_train=False)
save(model_name, y_train, label_cols, PATH, is_train=True)

print('done')

predicting
fit toxic
accuracy is 0.9894225439427881
fit severe_toxic
accuracy is 0.9946250456201989
fit obscene
accuracy is 0.9961013467998864
fit threat
accuracy is 0.9978195301519253
fit insult
accuracy is 0.9914185663276583
fit identity_hate
accuracy is 0.9946427630450696
total score is 0.9940049659812544
saving files
done
