In [14]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
import lightgbm as lgb

In [3]:
PATH = '../data/'

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')


train_sentence = train['comment_text_cleaned_polarity']
test_sentence = test['comment_text_cleaned_polarity']


train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']

text = train_sentence

text_retain_punctuation = train_sentence_retain_punctuation


print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [54]:


phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train skip gram')

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)


print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)


train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 162813546 stored elements in Compressed Sparse Row format>

In [57]:
# from sklearn.model_selection import train_test_split
# x_train, x_val, y_train_df, y_val_df = train_test_split(train_tfidf, train, test_size=0.33)
# # Split the dataset



# Split the dataset
split_index = round(len(train) * 0.999) #################################
shuffled_train = train#.sample(frac=1)
x_train = train_tfidf[:split_index]
y_train_df = train.iloc[:split_index]
#######
x_val = train_tfidf[split_index:]
y_val_df = train.iloc[split_index:]
# Get test data ready
x_test = test_tfidf


# train toxic
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)
r_dict = {label: np.log(pr(1, y_train_df[label].values, x_train) / pr(0,  y_train_df[label].values, x_train)) for label in label_cols}

In [58]:
train_set = {label: x_train.multiply(r_dict[label]).tocsr() for label in r_dict }
val_set = {label: x_val.multiply(r_dict[label]).tocsr() for label in r_dict }
test_set = {label: x_test.multiply(r_dict[label]).tocsr() for label in r_dict }

import gc
gc.collect()


247

In [49]:
for 
lgb_train_set = lgb.Dataset(train_set['toxic'], y_train_df['toxic'].values)
lgb_eval_set = lgb.Dataset(val_set['toxic'], y_val_df['toxic'].values, reference=lgb_train_set)


# params = {
#     'task': 'train',
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'metric': {'binary', 'auc'},
#     'learning_rate': 0.2,
#     'num_iterations': 100,
#     'num_leaves': 171,
#     'device': 'cpu',
#     'num_threads': 24,
#     'max_depth': -1,
# #     'min_data_in_leaf': 5,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 1,
#     'feature_fraction': 0.8,
#     'lambda_l1': 1,
#     'lambda_l2': 1}


params = {
    'learning_rate': 0.2,
    'application': 'binary',
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'auc',
    'data_random_seed': 2,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.6,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1
} 

gbm = lgb.train(params,
                lgb_train_set,
                valid_sets=lgb_eval_set,
               verbose_eval=10)





[10]	valid_0's auc: 0.955896
[20]	valid_0's auc: 0.9664
[30]	valid_0's auc: 0.972706
[40]	valid_0's auc: 0.974617
[50]	valid_0's auc: 0.975766
[60]	valid_0's auc: 0.975818
[70]	valid_0's auc: 0.975509
[80]	valid_0's auc: 0.975836
[90]	valid_0's auc: 0.97558
[100]	valid_0's auc: 0.975754


In [50]:
from sklearn.metrics import roc_auc_score
y = y_val_df['toxic']
pred = gbm.predict(val_set['toxic'])
print('accuracy is {}'.format(roc_auc_score(y,pred)))

accuracy is 0.9757536846018412


In [51]:
inv_char = {v: k for k, v in char_vectorizer.vocabulary_.items()}
inv_phrase = {v: k for k, v in phrase_vectorizer.vocabulary_.items()}
features = []
features_char = [inv_char[i] for i in range(0,train_char.shape[1] )]
features_phrase = [inv_phrase[i] for i in range(0,train_phrase.shape[1])]
features.extend(features_char)
features.extend(features_phrase)

In [52]:
df_importance = pd.DataFrame()
df_importance['score'] = pd.Series(gbm.feature_importance()) 
df_importance['feature'] = pd.Series(features)

In [53]:
df_importance.sort_values(by=['score'], ascending=False)

Unnamed: 0,score,feature
4,51,!
218188,46,cleaned_neutral_0
59935,45,ck
238597,40,hell
168584,38,sex
13651,36,me
176282,32,tard
259985,31,original_neutral_0
1050,31,)
169118,29,shit


In [None]:
##############################
lgb_train_set = {}
lgb_eval_set = {}
gbm_model = {}
y_pred = pd.DataFrame()
y_pred['id'] = test['id']
for col in label_cols:
    # create dataset for lightgbm


    params = {
    'learning_rate': 0.2,
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary', 'auc'},
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'auc',
    'data_random_seed': 2,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.6,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1
    } 

    print(col)
    lgb_train_set[col] = lgb.Dataset(train_set[col], y_train_df[col].values)
    lgb_eval_set[col] = lgb.Dataset(train_set[col], y_train_df[col].values, reference=lgb_train_set[col])
    

    gbm_model[col] = lgb.train(params,
                        lgb_train_set[col],
                        valid_sets=lgb_eval_set[col],
                       verbose_eval=20)
    y_pred[col] =  gbm_model[col].predict(test_set[col])

toxic
[20]	valid_0's auc: 0.961945
[40]	valid_0's auc: 0.980012
[60]	valid_0's auc: 0.986639
[80]	valid_0's auc: 0.99028
[100]	valid_0's auc: 0.99275
severe_toxic
[20]	valid_0's auc: 0.98983
[40]	valid_0's auc: 0.996956


In [None]:
y_pred.to_csv(PATH + 'lgbm_nb_tfidf.csv', index=False)