In [None]:
import os
import re
import csv
import codecs
import operator

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [None]:
def count_regexp_occ(regexp="", text=None):
    """Return the density of ``regexp`` matches in ``text``.

    The value is the number of non-overlapping matches found by
    ``re.findall`` divided by the length of ``text`` in characters, i.e. a
    matches-per-character rate rather than a raw count.

    Parameters
    ----------
    regexp : str
        Regular expression to search for.
    text : str or None
        Text to scan. ``None`` and the empty string both yield ``0``.

    Returns
    -------
    int or float
        ``0`` for missing/empty text, otherwise ``matches / len(text)``.
    """
    # The default ``text=None`` used to crash on ``len(None)``; treat it like
    # an empty string so the default arguments are actually usable.
    if text is None or len(text) == 0:
        return 0
    return len(re.findall(regexp, text)) / len(text)

In [None]:
# The first file (in sorted order) under for_ensemble/ supplies the ids that
# the training set is restricted to.
files = sorted(name for name in listdir('for_ensemble/') if isfile(join('for_ensemble/', name)))
fixed = pd.read_csv('for_ensemble/' + files[0])
df = fixed[['id']].copy()

# Inner join on `id`: training rows whose id is absent from that file are dropped.
train = pd.read_csv('../Dataset/train.csv').merge(df, on='id')

# Target columns, one per toxicity label.
label = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
test = pd.read_csv('../Dataset/test.csv')

In [None]:
F_EMBEDDING_FILE='../features/crawl-300d-2M.vec'
G_EMBEDDING_FILE='../features/glove.840B.300d.txt'


def _load_embedding_vocab(path):
    """Return the set of first whitespace-separated tokens of every line in ``path``.

    Only the leading token of each line is kept; the remaining fields are
    ignored. Lines that contain no token at all are reported with the same
    "Err on" message the notebook always printed, then skipped.
    """
    vocab = set()
    # `with` guarantees the handle is closed even if reading fails midway.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            values = line.split()
            if not values:
                # Blank line: there is no first token to index.
                print("Err on ", values[:3])
                continue
            vocab.add(values[0])
    return vocab


# Vocabularies consulted by count_unknown_fasttext / count_unknown_glove.
fasttext_index = _load_embedding_vocab(F_EMBEDDING_FILE)
glove_index = _load_embedding_vocab(G_EMBEDDING_FILE)


def count_unknown_glove(t):
    """Count the whitespace-separated tokens of ``t`` that are not in ``glove_index``."""
    return sum(1 for token in t.split() if token not in glove_index)
    
def count_unknown_fasttext(t):
    """Count the whitespace-separated tokens of ``t`` that are not in ``fasttext_index``."""
    return sum(1 for token in t.split() if token not in fasttext_index)

In [None]:
# Read the precomputed feature table from twitterINFO.csv. Later cells use its
# `id` and `type` columns; presumably it also carries `twitter_prob` — TODO confirm.
twitter_features = pd.read_csv('../features/twitterINFO.csv')

In [None]:
# Preview the first rows of the Twitter feature table.
twitter_features.head()

In [None]:
# Split the table on its `type` flag: rows with 0 go to the train part,
# rows with 1 go to the test part.
train_twitter = twitter_features.loc[twitter_features['type'] == 0]
test_twitter = twitter_features.loc[twitter_features['type'] == 1]

In [None]:
# The `type` flag has served its purpose once the table is split; remove it.
# NOTE: not idempotent — re-running this cell raises KeyError because the
# column is already gone.
train_twitter = train_twitter.drop(columns=['type'])
test_twitter = test_twitter.drop(columns=['type'])

In [None]:
# Attach the Twitter-derived columns to both sets via an inner join on `id`.
# NOTE(review): an inner join drops rows without a match, while `label` was
# sliced from `train` before this merge — verify that no training row is lost.
train = pd.merge(train, train_twitter, on='id')
test = pd.merge(test, test_twitter, on='id')

In [None]:
# Inspect the last rows of the merged training frame.
train.tail()

In [None]:
# Build the typo -> correction lookup from a two-column, comma-separated file.
# Every line must contain exactly one comma, otherwise ValueError is raised.
# NOTE(review): this path has no '../' prefix, unlike the other feature
# files read in this notebook — confirm it is intentional.
cl_path = 'features/cleanwords.txt'
with open(cl_path, 'r', encoding='utf-8') as cl:
    clean_word_dict = dict(entry.strip('\n').split(',') for entry in cl)

In [None]:
from collections import defaultdict
# Matches any single character that is NOT one of: ? ! . , : a letter a-z
# (either case), a digit, or a space.
special_character_removal = re.compile(r'[^?!.,:a-z\d ]', flags=re.IGNORECASE)

# Matches a run of one or more digits.
replace_numbers = re.compile(r'\d+', flags=re.IGNORECASE)

# Per-word counter; unseen words start at 0.
word_count_dict = defaultdict(int)
import re

def clean_text(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    """Return ``text`` unchanged.

    Text cleaning is currently switched off: all option flags are accepted for
    interface compatibility but none of them is consulted.
    """
    cleaned = text
    return cleaned

In [None]:
def create_meta_feature(df):
    """Add hand-crafted comment statistics to ``df`` in place.

    ``df`` must have a ``comment_text`` column of strings. The function adds:

    * ``clean_text`` (output of ``clean_text``),
    * length / capitalisation / punctuation / word counters,
    * regex features computed with ``count_regexp_occ`` (matches per
      character of the comment, not raw counts),
    * counts of words missing from the GloVe and fastText vocabularies.

    Nothing is returned; the new columns are written directly into ``df``.
    """
    comments = df['comment_text']

    df['clean_text'] = comments.apply(lambda t: clean_text(t))
    df['total_length'] = comments.apply(len)
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # An empty comment used to raise ZeroDivisionError here; its ratio is now 0.
    length = df['total_length']
    df['caps_vs_length'] = (df['capitals'] / length.where(length > 0)).fillna(0.0)
    df['num_exclamation_marks'] = comments.apply(lambda comment: comment.count('!'))
    df['num_question_marks'] = comments.apply(lambda comment: comment.count('?'))
    df['num_punctuation'] = comments.apply(
        lambda comment: sum(comment.count(w) for w in '.,;:'))
    df['num_symbols'] = comments.apply(
        lambda comment: sum(comment.count(w) for w in '*&$%'))
    df['num_words'] = comments.apply(lambda comment: len(comment.split()))
    df['num_unique_words'] = comments.apply(
        lambda comment: len(set(w for w in comment.split())))
    # Left as plain pandas division: a comment with no words yields NaN here.
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    df['num_smilies'] = comments.apply(
        lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))

    # (column name, pattern) pairs, listed in the order the columns are created.
    regex_features = [
        # Line breaks
        ("ant_slash_n", r"\n"),
        # F words - the loose f..k pattern also hits words such as folk, fork
        ("nb_fk", r"[Ff]\S{2}[Kk]"),
        # S words
        ("nb_sk", r"[Ss]\S{2}[Kk]"),
        # D words
        ("nb_dk", r"[dD]ick"),
        # "you": insulting someone usually needs someone to address
        ("nb_you", r"\W[Yy]ou\W"),
        # References to "mother"
        ("nb_mother", r"\Wmother\W"),
        # Racial slur
        ("nb_ng", r"\Wnigger\W"),
        # Some comments start with one or more colons
        ("start_with_columns", r"^\:+"),
        # Time stamps. NOTE(review): as written this matches ANY two
        # consecutive digits (or a colon followed by two digits); kept
        # unchanged so the feature values stay comparable with earlier runs.
        ("has_timestamp", r"\d{2}|:\d{2}"),
        # Long dates, e.g. 18:44, 8 December 2010
        ("has_date_long", r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}"),
        # Short dates, e.g. 8 December 2010
        ("has_date_short", r"\D\d{1,2} \w+ \d{4}"),
        # http / https links
        ("has_http", r"http[s]{0,1}://\S+"),
        # E-mail addresses
        ("has_mail", r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'),
        # "image:" markers
        ("has_image", r'image\:'),
        # IPv4 addresses (raw string now, so `\.` is no longer an invalid
        # string escape; the pattern text itself is unchanged)
        ("has_ip", r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}"),
        # Words surrounded by == word == or """" word """"
        ("has_emphasize_equal", r"\={2}.+\={2}"),
        ("has_emphasize_quotes", r"\"{4}\S+\"{4}"),
        # Asterisks
        ("has_star", r"\*"),
    ]
    for column, pattern in regex_features:
        # `p=pattern` binds the current pattern to the lambda explicitly.
        df[column] = comments.apply(lambda x, p=pattern: count_regexp_occ(p, x))

    df["unknown_glove"] = df['clean_text'].apply(lambda x: count_unknown_glove(x))
    df["unknown_fasttext"] = df['clean_text'].apply(lambda x: count_unknown_fasttext(x))
    df["unknown_glove_fasttext"] = df["unknown_glove"] + df["unknown_fasttext"]
print("Creating meta features")
# create_meta_feature adds its columns to the frame it is given.
for frame in (train, test):
    create_meta_feature(frame)
print("Created.")

# The raw text is not needed once the features are extracted.
train = train.drop(columns=['comment_text'])
test = test.drop(columns=['comment_text'])

In [None]:
# List the column names now present in the test frame.
test.columns.values

## Train data

In [None]:
print("Loading Train Data")
data_path = 'for_ensemble/'
# Every regular file in the directory, in sorted name order.
train_files = sorted(name for name in listdir(data_path) if isfile(join(data_path, name)))
print(train_files)
# One DataFrame per file, in the same order as train_files.
datas = [pd.read_csv(data_path + name) for name in train_files]

print("Finish Train Data loading")

In [None]:
# Fold all prediction frames into one wide frame keyed by `id`. The null
# count of every input, and of the running result after each merge, is
# printed so that rows missing from a file show up immediately.
for i, data in enumerate(datas):
    print(i, data.isnull().sum().sum())
    if i:
        # Left join: the rows of the first frame define the result.
        new_data = new_data.merge(data, how='left', on='id')
        print(i, new_data.isnull().sum().sum())
    else:
        new_data = data

### Workaround for the bug where fewer predictions than expected are present (run this section only once)

In [None]:
column_numbers = new_data.shape[1]
# Take every sixth column starting at offsets 1..6, one offset per label.
# Assumes column 0 is `id` and each merged file contributes its six label
# columns in the order below — TODO confirm against the prediction files.
# `.copy()` makes each frame independent of new_data, so the columns added
# to it later do not trigger pandas' SettingWithCopyWarning.
toxic, severe_toxic, obscene, threat, insult, identity_hate = (
    new_data.iloc[:, list(range(offset, column_numbers, 6))].copy()
    for offset in range(1, 7)
)

for frame in (toxic, severe_toxic, obscene, threat, insult, identity_hate):
    print(frame.shape)

In [None]:
# Copy every meta feature from `train` into each per-label training frame.
# NOTE(review): the assignment aligns on the row index, so this assumes the
# per-label frames and `train` list the same rows in the same order — verify.
for col in ['total_length', 'capitals', 'caps_vs_length',
       'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
       'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
       'num_smilies', 'ant_slash_n', 'nb_fk', 'nb_sk', 'nb_dk', 'nb_you',
       'nb_mother', 'nb_ng', 'start_with_columns', 'has_timestamp',
       'has_date_long', 'has_date_short', 'has_http', 'has_mail',
       'has_ip', 'has_emphasize_equal', 'has_emphasize_quotes',
       'has_star', 'unknown_glove', 'unknown_fasttext',
       'unknown_glove_fasttext', 'twitter_prob']:
    for frame in (toxic, severe_toxic, obscene, threat, insult, identity_hate):
        frame[col] = train[col]

In [None]:
# Drop the reference to the merged frame; the name is re-bound further down
# when the test-side frames are merged.
del new_data

## Training function / Load ensemble_results


In [None]:
print("Loading Data")
data_path = 'ensemble_results/'
# Every regular file in the directory, in sorted name order.
files = sorted(name for name in listdir(data_path) if isfile(join(data_path, name)))
print(files)

# One DataFrame per file, in the same order as `files`.
test_datas = [pd.read_csv(data_path + name) for name in files]

print("Finish loading")

In [None]:
# Number of test-side prediction files that were loaded.
len(test_datas)

In [None]:
# Fold all test-side prediction frames into one wide frame keyed by `id`,
# printing the null count of each input and of the running result.
for i, data in enumerate(test_datas):
    print(i, data.isnull().sum().sum())
    if i:
        # Left join: the rows of the first frame define the result.
        new_data = new_data.merge(data, how='left', on='id')
        print(i, new_data.isnull().sum().sum())
    else:
        new_data = data

In [None]:
column_numbers = new_data.shape[1]
# Take every sixth column starting at offsets 1..6, one offset per label.
# Assumes column 0 is `id` and each merged file contributes its six label
# columns in the order below — TODO confirm against the prediction files.
# `.copy()` makes each frame independent of new_data, so the columns added
# to it later do not trigger pandas' SettingWithCopyWarning.
(test_toxic, test_severe_toxic, test_obscene,
 test_threat, test_insult, test_identity_hate) = (
    new_data.iloc[:, list(range(offset, column_numbers, 6))].copy()
    for offset in range(1, 7)
)


In [None]:
# Copy every meta feature from `test` into each per-label test frame.
# NOTE(review): the assignment aligns on the row index, so this assumes the
# per-label frames and `test` list the same rows in the same order — verify.
for col in ['total_length', 'capitals', 'caps_vs_length',
       'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
       'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
       'num_smilies', 'ant_slash_n', 'nb_fk', 'nb_sk', 'nb_dk', 'nb_you',
       'nb_mother', 'nb_ng', 'start_with_columns', 'has_timestamp',
       'has_date_long', 'has_date_short', 'has_http', 'has_mail',
       'has_ip', 'has_emphasize_equal', 'has_emphasize_quotes',
       'has_star', 'unknown_glove', 'unknown_fasttext',
       'unknown_glove_fasttext', 'twitter_prob']:
    for frame in (test_toxic, test_severe_toxic, test_obscene,
                  test_threat, test_insult, test_identity_hate):
        frame[col] = test[col]
    

In [None]:
# Re-bind test_datas: from here on it holds the six per-label test feature
# frames (same order as the label columns), no longer the raw prediction files.
test_datas = [test_toxic, test_severe_toxic, test_obscene, test_threat, test_insult, test_identity_hate]

In [None]:
# (rows, columns) of the test features for the `toxic` label.
test_toxic.shape

In [None]:
# (rows, columns) of the train features for the `toxic` label; the column
# count should equal the one shown for test_toxic.
toxic.shape

In [None]:
# Target labels, in the order used to index the per-label feature frames.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
def fit_every_feature_model(feature_data, label, feature_name, feature_test_data, predict = False):
    """Cross-validate a LightGBM classifier for one label and optionally predict.

    The rows of ``feature_data`` are cut into ``fold_count`` contiguous folds
    (``fold_count`` is read from module scope). For each fold a fresh
    ``LGBMClassifier`` is trained on the remaining rows, validated on the
    fold, and its feature importances are plotted.

    Parameters
    ----------
    feature_data : sliceable 2-D table of training features (rows = samples).
    label : mapping/DataFrame from which ``label[feature_name].values`` is read.
    feature_name : name of the target column to fit.
    feature_test_data : features to score when ``predict`` is true.
    predict : when true, each fold model scores ``feature_test_data`` and the
        positive-class probabilities are averaged over the folds.

    Returns
    -------
    (predictions, mean_auc)
        ``predictions`` is a 1-D array with one entry per test row (all zeros
        when ``predict`` is false); ``mean_auc`` is the average of each fold's
        best validation AUC.
    """
    predictions = np.zeros(shape=[len(feature_test_data)])
    n_rows = len(feature_data)
    fold_size = n_rows // fold_count

    the_label = label[feature_name].values
    print("Feature name: ", feature_name)
    auc = 0
    for fold_id in range(0, fold_count):
        print("Fold : ", fold_id)
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size
        # The last fold absorbs the remainder rows. The old check compared
        # against `fold_size - 1` and referenced an undefined `X`, so the
        # remainder rows were never validated on.
        if fold_id == fold_count - 1:
            fold_end = n_rows

        train_x = np.concatenate([feature_data[:fold_start], feature_data[fold_end:]])
        train_y = np.concatenate([the_label[:fold_start], the_label[fold_end:]])

        val_x = feature_data[fold_start:fold_end]
        val_y = the_label[fold_start:fold_end]

        lgbm_model = lgb.LGBMClassifier(max_depth=5, metric="auc", n_estimators=10000,
                                   num_leaves=32, boosting_type="gbdt",
                                        learning_rate=0.01, feature_fraction=0.3,
                                   bagging_fraction=0.8, bagging_freq=5, reg_lambda=0)
        # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords
        # of older LightGBM releases; newer releases expect callbacks instead.
        lgbm_model.fit(X=train_x, y=train_y,eval_metric=['auc','binary_logloss'],
                           eval_set =(val_x, val_y),
                          early_stopping_rounds=1000, verbose=500)
        auc += lgbm_model.best_score_['valid_0']['auc']
        lgb.plot_importance(lgbm_model, max_num_features=30)
        plt.show()
        if predict:
            prediction = lgbm_model.predict_proba(feature_test_data)
            # Column 1 holds the positive-class probability.
            predictions += prediction[:,1]
    # Average the accumulated fold predictions.
    predictions /= fold_count
    print("Training  Finish")

    return predictions, auc / fold_count


In [None]:
fold_count = 5
# `train_datas` was referenced below without being defined in any cell (NameError
# on a fresh kernel). Build it from the per-label training frames, in the
# same label order as `test_datas` and `label_columns`.
train_datas = [toxic, severe_toxic, obscene, threat, insult, identity_hate]
all_auc = []
predictions = []
for i, feature_name in enumerate(label_columns):
    # One cross-validated model per label; test predictions are averaged over folds.
    prediction, auc = fit_every_feature_model(train_datas[i], label, feature_name, test_datas[i], predict=True)
    all_auc.append(auc)
    predictions.append(prediction)
print("Overall AUC", sum(all_auc) / len(all_auc))
print("Each AUC", all_auc)

Training  Finish
Overall AUC 0.9918501556540616
Each AUC [0.9890730337146506, 0.9920222918771883, 0.9955802803707211, 0.9932634005508755, 0.9901046133930338, 0.9910573140179008]

Overall AUC 0.991772595736773
Each AUC [0.9891636022238185, 0.9922094108617353, 0.9955908009685464, 0.9928002542080963, 0.9902056092164748, 0.9906658969419672]

Overall AUC 0.992402256021542
Each AUC [0.9895523364509066, 0.9924790697348147, 0.9957228439902718, 0.9942676419613532, 0.9903821148935895, 0.9920095290983164]

dart Overall AUC 0.9918463306382743
Each AUC [0.9895559477581786, 0.9918782844225191, 0.9956426464701587, 0.9919448445004209, 0.9903768557584822, 0.9916794049198863]

Overall AUC 0.9924422528049991
Each AUC [0.9895998532864534, 0.9925303930007228, 0.9957953188302582, 0.9936798954670287, 0.9905025802713494, 0.9925454759741816]

Overall AUC 0.9927553975997196
Each AUC [0.9896575066913226, 0.9925319108904613, 0.9958337731585984, 0.9951189568519518, 0.9906241195916726, 0.9927661184143105]

Overall AUC 0.9928417266022279
Each AUC [0.9898409916016264, 0.9926231321876025, 0.9959125410608889, 0.9950726242194354, 0.9907445504706871, 0.9928565200731265]

Overall AUC 0.9927522465565181
Each AUC [0.9898409027947984, 0.9926108135621181, 0.9959050092189476, 0.9944645254713077, 0.9907610116240857, 0.99293121666785]

Overall AUC 0.9928923711203441
Each AUC [0.98983781123029, 0.9926462987999607, 0.9959119831639403, 0.9952592506885705, 0.9907596664543711, 0.9929392163849318]

Overall AUC 0.991030175791376
Each AUC [0.9878416437898958, 0.990220606684348, 0.9953303378054033, 0.9924730357089938, 0.9896683886147624, 0.990647042144853]

Overall AUC 0.9926881408536721
Each AUC [0.9898203113911324, 0.992560373593184, 0.9958975654820794, 0.9944830495431566, 0.9907448846625799, 0.9926226604499003]

## Submission


In [None]:
# Fill the sample submission with one prediction vector per label and save it.
# NOTE(review): this path has no '../' prefix, unlike the train/test CSVs
# read earlier from '../Dataset/' — confirm it is intentional.
subm = pd.read_csv('Dataset/sample_submission.csv')
for i in range(len(label_columns)):
    subm[label_columns[i]] = predictions[i]
subm.to_csv('LGBM_RANDR_99289_F5.csv', index=False)