In [3]:
import pandas as pd
import string
import re
import random
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize, TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from nltk.stem.porter import PorterStemmer

In [4]:
# Read the labelled training comments and the hold-out comments from disk.
train, holdout = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [5]:
# Attach the label columns to the hold-out comments, matching rows on `id`.
test_labels_df = pd.read_csv("test_labels.csv")

# Re-running this cell used to merge the labels a second time, leaving
# suffixed duplicates (`*_x` / `*_y`) that break the positional `iloc[:, 2:]`
# slice used for the targets. Dropping any label columns that are already
# present makes the cell idempotent; on a fresh kernel the drop is a no-op.
label_columns = test_labels_df.columns.difference(['id'])
holdout = (
    holdout
    .drop(columns=label_columns, errors='ignore')
    # validate= raises pandas' MergeError instead of silently duplicating
    # rows if an id occurs more than once on either side.
    .merge(test_labels_df, on='id', validate='one_to_one')
)

In [6]:
# Tokenise every comment with NLTK's tweet tokenizer: output is lower-cased
# (preserve_case=False) and reduce_len=True is enabled.
tknzr = TweetTokenizer(reduce_len=True, preserve_case=False)
tokens = list(map(tknzr.tokenize, train['comment_text']))
tokens_test = list(map(tknzr.tokenize, holdout['comment_text']))

In [7]:
# Porter-stem each token, keeping one list of stems per comment.
porter = PorterStemmer()
list_of_stems_lists = [
    list(map(porter.stem, comment_tokens)) for comment_tokens in tokens
]
test_list_of_stems_lists = [
    list(map(porter.stem, comment_tokens)) for comment_tokens in tokens_test
]

In [8]:
# Re-assemble each comment's stems into one space-separated string, the
# input format the TF-IDF vectorizer consumes.
stems_for_tfidf = [' '.join(stems) for stems in list_of_stems_lists]
test_stems_for_tfidf = [' '.join(stems) for stems in test_list_of_stems_lists]

In [9]:
# Features are the stemmed comment strings; targets are every column from the
# third one onward in the corresponding frame.
X_train2, X_test2 = stems_for_tfidf, test_stems_for_tfidf
y_train2, y_test2 = train.iloc[:, 2:], holdout.iloc[:, 2:]

In [10]:
# TF-IDF unigram features feeding a one-vs-rest logistic regression
# (one binary classifier per target column).
pipe_lr_final = make_pipeline(
    TfidfVectorizer(
        # Must be a real bool: scikit-learn releases that validate parameter
        # types reject the integer 1 that was passed here before.
        sublinear_tf=True,
        max_features=45000,
        strip_accents='ascii',
        ngram_range=(1, 1),
    ),
    OneVsRestClassifier(
        # 'sag' is a stochastic solver; pinning random_state makes repeated
        # runs produce the same fitted model and probabilities.
        LogisticRegression(solver='sag', C=1.85, random_state=42)
    ),
)

In [11]:
# Fit the pipeline on the training data, then score the hold-out comments.
# fit() returns the fitted pipeline, so the two steps can be chained.
test_preds_final = (
    pipe_lr_final
    .fit(X_train2, y_train2)
    .predict_proba(X_test2)
)

In [46]:
# Column names for the predicted probabilities, in the order they are applied
# to the columns of the prediction array.
label_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
submission_df = pd.DataFrame(data=test_preds_final, columns=label_names)

In [47]:
# Attach the hold-out ids positionally (.values bypasses index alignment),
# then preview the first rows.
holdout_ids = holdout['id'].values
submission_df = submission_df.assign(id=holdout_ids)
submission_df.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,id
0,0.999695,0.221466,0.998754,0.089041,0.98108,0.34729,00001cee341fdb12
1,0.002277,0.001155,0.001378,0.000279,0.003236,0.001937,0000247867823ef7
2,0.022186,0.003879,0.011608,0.000887,0.010403,0.002384,00013b17ad220c46
3,0.002624,0.001871,0.002047,0.000979,0.004389,0.000647,00017563c3f7919a
4,0.011206,0.001116,0.004114,0.000835,0.003968,0.001198,00017695ad8997eb


In [48]:
# Put `id` first, followed by the six probability columns, then preview.
submission_columns = [
    'id',
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
submission_df = submission_df.loc[:, submission_columns]
submission_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999695,0.221466,0.998754,0.089041,0.98108,0.34729
1,0000247867823ef7,0.002277,0.001155,0.001378,0.000279,0.003236,0.001937
2,00013b17ad220c46,0.022186,0.003879,0.011608,0.000887,0.010403,0.002384
3,00017563c3f7919a,0.002624,0.001871,0.002047,0.000979,0.004389,0.000647
4,00017695ad8997eb,0.011206,0.001116,0.004114,0.000835,0.003968,0.001198


In [49]:
# Sanity check: print the row count, per-column non-null counts and dtypes of
# the submission frame.
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 7 columns):
id               153164 non-null object
toxic            153164 non-null float64
severe_toxic     153164 non-null float64
obscene          153164 non-null float64
threat           153164 non-null float64
insult           153164 non-null float64
identity_hate    153164 non-null float64
dtypes: float64(6), object(1)
memory usage: 8.2+ MB


In [50]:
# The export is left disabled; uncomment the next line to write the submission
# frame to 'submission.csv' without the row index.
# submission_df.to_csv('submission.csv', index=False)