# Importing libraries


In [None]:
from sklearn.model_selection import KFold
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import pandas as pd
import numpy as np
import pickle

import nltk
import re

# Class for cleaning data

In [None]:
class TweetCleaner():
    """Cleaning and stemming helpers for a DataFrame of text comments.

    ``cleanFrame`` reads the ``comment_text`` column and writes ``clean_paper``;
    ``stemFrame`` reads ``clean_paper`` and writes ``stem_paper``. Both mutate
    the frame that is passed in and return ``None``.
    """

    def __init__(self):
        pass

    def _scrub_words(self, text):
        """Drop ``<...>`` tags and turn every non-word character or digit into a space.

        Returns the result with leading/trailing whitespace stripped.
        """
        text = re.sub("(<.*?>)", "", text)
        text = re.sub("(\\W|\\d)", " ", text)
        return text.strip()

    def _stop_word_set(self):
        """Return the English stop words, loading them from NLTK only once per instance."""
        cached = getattr(self, "_stop_words", None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def _cleanString(self, s, special_chars = "\":,.|#ðÿœžðÿâœœïÿœžÿºÿÿœžÿ"):
        """Remove mentions, URLs, markup, punctuation, digits and stop words from ``s``.

        Parameters
        ----------
        s : str
            Raw comment text.
        special_chars : str
            Characters deleted outright after scrubbing (by default mostly
            mis-encoded letters that ``\\W`` does not match).

        Returns
        -------
        str
            The surviving tokens joined by single spaces.
        """
        web_regex =  r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        # Raw string: '\w' inside a normal string literal is an invalid escape sequence.
        s = re.sub(r'(?<!\w)@[\w+]{1,15}', '', s)
        s = re.sub(web_regex, '', s)

        # Scrub exactly once, and before deleting special characters. Newlines and
        # punctuation become spaces, so words on either side of a line break are
        # no longer glued together, and the text is scrubbed even when
        # special_chars is empty.
        s = self._scrub_words(s)
        for char in special_chars:
            s = s.replace(char, "")

        stop_words = self._stop_word_set()
        tokens = TweetTokenizer().tokenize(s)
        # The NLTK stop-word list is lowercase; compare case-insensitively so that
        # capitalised stop words ("I", "If", "The") are filtered as well.
        cleaned_words = [w for w in tokens if w.lower() not in stop_words]

        return " ".join(cleaned_words)

    def _stemWords(self, sentence):
        """Return ``sentence`` with every token replaced by its Porter stem."""
        stemmer, tokenizer = PorterStemmer(), TweetTokenizer()
        stemmed_words = [stemmer.stem(w) for w in tokenizer.tokenize(sentence)]

        return " ".join(stemmed_words)

    def cleanFrame(self, frame):
        """Add a ``clean_paper`` column computed from ``frame.comment_text`` (in place)."""
        frame['clean_paper'] = frame.comment_text.apply(self._cleanString)

    def stemFrame(self, frame):
        """Add a ``stem_paper`` column computed from ``frame.clean_paper`` (in place)."""
        frame['stem_paper'] = frame.clean_paper.apply(self._stemWords)

    def rebrand_senti(self, senti):
        """Collapse a label to binary: 0 stays 0, any other value becomes 1."""
        if senti != 0:
            senti = 1

        return senti

        
    

# Reading csv

In [None]:
# Load the raw training data.
a = pd.read_csv('/content/train.csv')

# Every later cell works on `df`, which was never assigned anywhere in the
# notebook (it only existed as leftover kernel state). Later outputs show it
# holding just the text and the `toxic` label, so build it explicitly here;
# .copy() keeps later column assignments from touching `a`.
df = a[['comment_text', 'toxic']].copy()
a.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Balancing data

In [None]:
# Distinct label values present in the `toxic` column.
df["toxic"].unique()

array([0, 1])

In [None]:
# Rows per label, to see how imbalanced the classes are.
df["toxic"].value_counts()

0    148543
1     15767
Name: toxic, dtype: int64

In [None]:
# Split the rows by label. Note the naming: `pos` holds the rows with toxic == 0
# and `neg` the rows with toxic == 1.
pos = df.loc[df["toxic"] == 0]
neg = df.loc[df["toxic"] == 1]

# Number of toxic == 1 rows.
size = len(neg)

In [None]:
# Undersample the toxic == 0 rows down to the number of toxic == 1 rows.
# A fixed random_state makes the balanced subset (and everything computed
# from it) the same on every Restart & Run All.
pos_bal = pos.sample(len(neg), random_state=42)
len(pos_bal)

15767

In [None]:
# Sanity check: the sampled subset should contain only label 0.
pos_bal["toxic"].unique()

array([0])

In [None]:
# Stack the sampled toxic == 0 rows on top of the toxic == 1 rows and
# renumber the index from 0.
df = pd.concat([pos_bal, neg]).reset_index(drop=True)
df

Unnamed: 0,comment_text,toxic
0,"""\n\n Please do not vandalize pages, as you di...",0
1,"""::Yes, lets pretend it was just """"general cha...",0
2,"Well, it's also almost 9:00am here, and I have...",0
3,Obsessive-compulsive_disorder 76.166.31.163,0
4,it would be informative for me to add it,0
...,...,...
31529,"""\n\n our previous conversation \n\nyou fuckin...",1
31530,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1
31531,Your absurd edits \n\nYour absurd edits on gre...,1
31532,"""\n\nHey listen don't you ever!!!! Delete my e...",1


In [None]:
# Confirm both labels now have the same number of rows.
df["toxic"].value_counts()

1    15767
0    15767
Name: toxic, dtype: int64

In [None]:
# Columns available before the cleaning step adds its own.
df.columns

Index(['comment_text', 'toxic'], dtype='object')

# Initializing the cleaner

In [None]:

# Fetch the NLTK stop-word corpus that stopwords.words('english') reads.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Both calls mutate df in place: cleanFrame adds `clean_paper`,
# stemFrame then derives `stem_paper` from it (so the order matters).
clr = TweetCleaner()
clr.cleanFrame(df)
clr.stemFrame(df)

# Left disabled: df has no `senti` column here (its columns are
# comment_text and toxic), and `toxic` only takes the values 0 and 1.
#df.senti = df.senti.apply(clr.rebrand_senti)
df.head()

Unnamed: 0,comment_text,toxic,clean_paper,stem_paper
0,"""\n\n Please do not vandalize pages, as you di...",0,Please vandalize pages edit Timeline evolution...,pleas vandal page edit timelin evolut If conti...
1,"""::Yes, lets pretend it was just """"general cha...",0,Yes lets pretend general chatter like talking ...,ye let pretend gener chatter like talk weather...
2,"Well, it's also almost 9:00am here, and I have...",0,Well also almost I even morning coffee yet Tha...,well also almost I even morn coffe yet thank n...
3,Obsessive-compulsive_disorder 76.166.31.163,0,Obsessive compulsive_disorder,obsess compulsive_disord
4,it would be informative for me to add it,0,would informative add,would inform add


# Preprocessing the input data for the model

In [None]:
def get_vocab(df):
    """Return the distinct non-stop-word tokens of ``df.stem_paper``.

    Parameters
    ----------
    df : DataFrame
        Must have a ``stem_paper`` column of whitespace-separated tokens.

    Returns
    -------
    list of str
        Unique tokens in order of first appearance, English stop words excluded.
    """
    stop_words = set(stopwords.words('english'))
    tokens = ' '.join(list(df.stem_paper)).split()
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # linear time instead of scanning a growing list for every token.
    return list(dict.fromkeys(w for w in tokens if w not in stop_words))

# Build the vocabulary from the stemmed text and report how many distinct tokens it has.
V = get_vocab(df)
print(len(V))

48415


In [None]:
def word_frequencies(df, V):
    """Count, per vocabulary word, its occurrences in each class.

    Parameters
    ----------
    df : DataFrame
        Must have ``stem_paper`` (space-separated tokens) and ``toxic`` columns.
    V : iterable of str
        Vocabulary; words outside it are ignored.

    Returns
    -------
    (pos_fr, neg_fr) : tuple of dict
        ``pos_fr[word]`` counts occurrences in rows whose ``toxic`` is not 0,
        ``neg_fr[word]`` counts occurrences in rows whose ``toxic`` is 0.
    """
    pos_fr = dict.fromkeys(V, 0)
    neg_fr = dict.fromkeys(V, 0)

    # Iterate the two columns directly instead of building a row Series with
    # df.iloc[i] for every row.
    for sentence, label in zip(df.stem_paper, df.toxic):
        counts = neg_fr if label == 0 else pos_fr
        for word in sentence.split(' '):
            # Both dicts are keyed by V, so a dict lookup is an O(1) stand-in
            # for the original linear `word in V` scan.
            if word in counts:
                counts[word] += 1

    return (pos_fr, neg_fr)

In [None]:
# pos_dict: per-word counts in rows with toxic != 0; neg_dict: counts in rows with toxic == 0.
# NOTE(review): the counts come from the whole of df, before the K-fold split below,
# so rows that later land in a test fold also contribute to the features — confirm
# whether that leakage is acceptable.
pos_dict, neg_dict = word_frequencies(df,V)

In [None]:
def transform_data(df, pos_dict, neg_dict, V):
    """Turn each stemmed sentence into summed word-frequency features.

    Parameters
    ----------
    df : DataFrame
        Must have ``stem_paper`` (whitespace-separated tokens) and ``toxic`` columns.
    pos_dict, neg_dict : dict
        Per-word counts; must contain every word of ``V``.
    V : iterable of str
        Vocabulary; tokens outside it contribute nothing.

    Returns
    -------
    DataFrame
        Columns ``bias`` (always 1), ``positivity`` (sum of ``pos_dict`` counts
        over the sentence's tokens), ``negativity`` (same with ``neg_dict``)
        and ``target_senti`` (``df.toxic``).
    """
    # Set membership is O(1); testing against the list form of V costs a full
    # scan of the vocabulary for every token.
    vocab = frozenset(V)
    new_df = {'bias':[], 'positivity':[], 'negativity':[], 'target_senti':[]}

    for sentence in df.stem_paper:
        known = [word for word in sentence.split() if word in vocab]
        new_df['bias'].append(1)
        new_df['positivity'].append(sum(pos_dict[word] for word in known))
        new_df['negativity'].append(sum(neg_dict[word] for word in known))

    # The label column does not depend on the loop, so assign it once.
    new_df['target_senti'] = df.toxic

    return pd.DataFrame(new_df)

    

In [None]:
# Feature table for the model: one row per comment with bias, positivity,
# negativity and the target label.
trans_df = transform_data(df, pos_dict, neg_dict, V)
trans_df.head()

Unnamed: 0,bias,positivity,negativity,target_senti
0,1,13043,23433,0
1,1,6528,8911,0
2,1,25558,32581,0
3,1,63,29,0
4,1,1879,5467,0


# Splitting with K-fold cross-validation

In [None]:
# 5-fold splitter. With shuffle=True the fold assignment is random, so pin
# random_state to get the same folds (and the same scores) on every run.
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

In [None]:
# .iloc is purely positional and requires integer indexers; string positions
# raise IndexError on current pandas versions.
trans_df.iloc[[1, 2]]

Unnamed: 0,bias,positivity,negativity,target_senti
1,1,6528,8911,0
2,1,25558,32581,0


# Defining logistic regression model

### Accuracy of the model

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix

# Feature matrix and target vector taken from the transformed frame.
X = trans_df[['bias', 'positivity', 'negativity']]
Y = trans_df.target_senti

# Per-fold metrics, summed after the loop.
fold_acc, fold_f1, fold_rec = [], [], []

for train_index, test_index in kfold.split(trans_df):
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # A fresh model per fold; after the loop sk_model is the last fold's model.
    sk_model = LogisticRegression()
    sk_model.fit(X_train, Y_train)
    pred = sk_model.predict(X_test)

    score = accuracy_score(Y_test, pred)
    recall = recall_score(Y_test, pred)
    f1 = f1_score(Y_test, pred)

    fold_acc.append(score)
    fold_f1.append(f1)
    fold_rec.append(recall)

    print('Acc: {}, F1: {}, Recall: {}'.format(score, f1, recall))
    print('Confusion matrix: ')
    print(confusion_matrix(Y_test, pred))

m = kfold.get_n_splits()
avg, avg_f1, avg_rec = sum(fold_acc), sum(fold_f1), sum(fold_rec)

print('Avg Acc: {} ,Avg  F1: {}, Avg Recall: {}'.format(avg/m, avg_f1/m, avg_rec/m))

Acc: 0.8119549706675123, F1: 0.8093860495017678, Recall: 0.7871209753047828
Confusion matrix: 
[[2603  505]
 [ 681 2518]]
Acc: 0.8141747264943713, F1: 0.8088714938030006, Recall: 0.7925854905720677
Confusion matrix: 
[[2655  523]
 [ 649 2480]]
Acc: 0.8056128111622007, F1: 0.7974223397224057, Recall: 0.7731496315283563
Confusion matrix: 
[[2668  518]
 [ 708 2413]]
Acc: 0.8149674964325353, F1: 0.8097799511002445, Recall: 0.789574062301335
Confusion matrix: 
[[2656  505]
 [ 662 2484]]
Acc: 0.8108150967332699, F1: 0.8075496047749636, Recall: 0.7890920554854981
Confusion matrix: 
[[2610  524]
 [ 669 2503]]
Avg Acc: 0.8115050202979779 ,Avg  F1: 0.8066018877804766, Avg Recall: 0.7863044430384079


# Saving the trained model

In [None]:
from joblib import dump

# Write the fitted model to 'modeLr.joblib'.
# NOTE(review): sk_model is whatever the last iteration of the cross-validation
# loop assigned, i.e. a model fitted on one training fold only — confirm this is
# the model that should be saved.
dump(sk_model, 'modeLr.joblib')

['modeLr.joblib']

In [None]:
# save the model to disk
filename = 'modeLr.sav'
# The context manager flushes and closes the file even if pickling fails;
# a bare open() passed straight to pickle.dump leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(sk_model, model_file)
