# Libraries

In [None]:
# Mount Google Drive so the CSV files under /content/drive can be read (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re
import string
import itertools as it
import pickle
import os
from  pathlib import Path

import nltk
from nltk.corpus import stopwords                  # module for stop words that come with NLTK
from nltk.stem.wordnet import WordNetLemmatizer    # module for lemmatization
from nltk import word_tokenize, pos_tag            # tokenization and Part of Speech tagging

nltk.download('stopwords') #stopwords used to preprocess the corpus

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# English stop-word list shipped with NLTK; preprocess() uses it to drop filler words
stopwords_english = stopwords.words('english') # a list of English stopwords

# Lemmatization is currently disabled; kept here for reference:
# lemmatizer = WordNetLemmatizer()  # returns the lemmatized form of a word
#                                   # ("was" => "be" - "rocks" => "rock")

# Import Data

In [None]:
# Root folder of the project data on the mounted Drive.
# Keeping it in one place means a single edit is enough to run the notebook elsewhere.
DATA_DIR = Path("/content/drive/MyDrive/Colab Notebooks/Cybertrolling_Project/Data")

train_data = pd.read_csv(DATA_DIR / "train" / "train.csv")                   # comments with their labels
test_data = pd.read_csv(DATA_DIR / "test" / "test.csv")                      # comments only
test_label_data = pd.read_csv(DATA_DIR / "test_labels" / "test_labels.csv")  # labels for the test comments

In [None]:
# Column names, dtypes and non-null counts of the training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [None]:
# Number of missing values per column
train_data.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [None]:
# Preview the first ten training rows
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


As we can see, the training dataset contains:
* the comment ID
* the raw text
* the different categories of toxicity

In [None]:
# Let's check some comments
# .iloc selects by position, so the preview does not depend on the frame
# still carrying its default 0..n-1 index labels.
for raw_comment in train_data['comment_text'].iloc[:10]:
    print(raw_comment)
    print('---------------')

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
---------------
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
---------------
Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
---------------
"
More
I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any prefere

In [None]:
# Let's look at the first ten rows of test.csv
test_data.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


Here we only have the IDs and the comments, with no classification labels.

In [None]:
# We are going to use this dataset to measure the accuracy of our models.
# Rows whose 'toxic' value is -1 are dropped.
# NOTE(review): presumably -1 marks comments without a usable label — confirm against the dataset description.
test_label_data = test_label_data.loc[test_label_data['toxic']!=-1]

In [None]:
# Let's calculate the % of toxic comments in each category.
# Column 0 is the id and every remaining column is a label, so the slice
# runs to the end: the previous 1:-1 slice silently left out the last
# label column (identity_hate).
test_label_data.iloc[:, 1:].sum(axis=0) / test_label_data.shape[0]

toxic           0.095189
severe_toxic    0.005736
obscene         0.057692
threat          0.003298
insult          0.053565
dtype: float64

In [None]:
# Let's attach each comment to its labels by joining on the comment id.
# The inner join keeps only ids present in both frames, i.e. the rows that
# survived the label filter above.
test = test_label_data.merge(test_data, on='id', how="inner")
test.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
0,0001ea8717f6de06,0,0,0,0,0,0,Thank you for understanding. I think very high...
1,000247e83dcc1211,0,0,0,0,0,0,:Dear god this site is horrible.
2,0002f87b16116a7f,0,0,0,0,0,0,"""::: Somebody will invariably try to add Relig..."
3,0003e1cccfd5a40a,0,0,0,0,0,0,""" \n\n It says it right there that it IS a typ..."
4,00059ace3e3e9a53,0,0,0,0,0,0,""" \n\n == Before adding a new product to the l..."
5,000663aff0fffc80,0,0,0,0,0,0,this other one from 1897
6,000689dd34e20979,0,0,0,0,0,0,== Reason for banning throwing == \n\n This ar...
7,000844b52dee5f3f,0,0,0,0,0,0,|blocked]] from editing Wikipedia. |
8,00091c35fa9d0465,1,0,0,0,0,0,"== Arabs are committing genocide in Iraq, but ..."
9,000968ce11f5ee34,0,0,0,0,0,0,Please stop. If you continue to vandalize Wiki...


In [None]:
# Check if it worked
# We take an id from the merged `test` frame and look up its comment in test_data.
# The variable is called sample_id so that the built-in id() is not shadowed.
sample_id = "000968ce11f5ee34"
test_data.loc[test_data["id"] == sample_id]

Unnamed: 0,id,comment_text
22,000968ce11f5ee34,Please stop. If you continue to vandalize Wiki...


In [None]:
# Check if it worked (second sample)
# We take an id from the merged `test` frame and look up its comment in test_data.
# The variable is called sample_id so that the built-in id() is not shadowed.
sample_id = "000689dd34e20979"
test_data.loc[test_data["id"] == sample_id]

Unnamed: 0,id,comment_text
17,000689dd34e20979,== Reason for banning throwing == \n\n This ar...


It worked

# Clean the corpus

In [None]:
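# Candidate list of punctuation characters to remove (kept for reference, currently unused)
# Note: this list would keep '!' in the corpus, but preprocess() below strips every
# character of string.punctuation, which includes '!'
# punc = '''()-[]{};:'"\,<>./?@#$%^&*_~'''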

In [None]:
# Let's define a function that preprocesses a collection of texts

def preprocess(corpus, stop_words=None):
    
    '''
    Clean every text of a corpus: lowercase it, then remove hyperlinks, punctuation,
    words containing digits and stopwords.

    Input :
        corpus     : an iterable of strings
        stop_words : optional iterable of words to drop; when omitted, the
                     notebook-level `stopwords_english` list is used
    Output : a generator yielding one cleaned string per input text, in order,
             with the remaining words separated by single spaces
    '''

    if stop_words is None:
        stop_words = stopwords_english
    # A set gives constant-time membership tests instead of scanning a list per word
    stop_words = frozenset(stop_words)

    # Patterns are compiled once, outside the per-text loop.
    # Raw strings avoid the invalid-escape warning that '\w' / '\d' trigger in a plain literal.
    link_pattern = re.compile(r'https?://\S+')
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    digit_word_pattern = re.compile(r'\w*\d\w*')

    for text in corpus:

        text = text.lower()                        # Lowercase
        text = link_pattern.sub('', text)          # Remove links
        text = punctuation_pattern.sub('', text)   # Remove punctuation
        text = digit_word_pattern.sub('', text)    # Remove words containing numbers

        # split() with no argument breaks on any whitespace run (spaces, tabs, newlines).
        # Splitting on ' ' only left tokens such as "explanation\nwhy" in one piece,
        # so stopwords next to a line break were never filtered out.
        # NOTE(review): punctuation is stripped before this filter, so contractions
        # ("don't" -> "dont") no longer match their apostrophised stop-word entries.
        yield ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
%%time

# preprocess() is a generator, so we materialise the cleaned training comments
# into a list that can be indexed and reused by the following cells
clean_comments = list(preprocess(train_data['comment_text']))

CPU times: user 38.9 s, sys: 141 ms, total: 39.1 s
Wall time: 41.1 s


In [None]:
# Preview the first ten cleaned comments, each followed by a separator line
for position in range(10):
    cleaned_comment = clean_comments[position]
    print(cleaned_comment, '------------', sep='\n')

explanation
why edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired 
------------
daww matches background colour im seemingly stuck thanks  talk  january   utc
------------
hey man im really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info
------------

more
i cant make real suggestions improvement  wondered section statistics later subsection types accidents  think references may need tidying exact format ie date format etc later noone else first  preferences formatting style references want please let know

there appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipediagoodarticlenominationstransport  
------------
sir hero chance remember page thats
------------


congratulations well use tools well  · talk 
------------
cocksucker piss around work
-----------

In [None]:
%%time
# We apply the same cleaning to the comments of the merged, labelled test set
test_clean_comments = list(preprocess(test["comment_text"]))

CPU times: user 14.4 s, sys: 54.1 ms, total: 14.4 s
Wall time: 14.7 s


# Word Embeddings

In [None]:
%%time

# Bag-of-words
vectorizer = CountVectorizer(min_df=3,max_df=0.9) # Keep only words present in at least min_df documents and in no more than 90% of all documents
bow = vectorizer.fit_transform(clean_comments) # learn the vocabulary and return a document-term matrix (n_samples, n_features)
bow_test = vectorizer.transform(test_clean_comments) # only transform (no fit): the test matrix reuses the training vocabulary, hence the same columns

CPU times: user 7.31 s, sys: 119 ms, total: 7.43 s
Wall time: 7.45 s


In [None]:
# Both matrices must have the same number of columns (one per vocabulary term)
bow.shape , bow_test.shape

((159571, 52731), (63978, 52731))

In [None]:
# Let's take a look at the features / vocabulary
# The vocabulary array is fetched once (instead of being rebuilt for every print)
# and 30 terms are shown starting at a few different offsets.
feature_names = vectorizer.get_feature_names_out()
for position, start in enumerate((0, 100, 1000, 10000)):
    if position:
        print('------------')   # separator between two consecutive samples
    print(feature_names[start:start + 30])

['aa' 'aaa' 'aaand' 'aac' 'aachen' 'aah' 'aaliyah' 'aamir' 'aan' 'aand'
 'aang' 'aap' 'aaps' 'aar' 'aardvark' 'aarem' 'aaron' 'aarons' 'aas'
 'aatalk' 'aau' 'aave' 'ab' 'aba' 'aback' 'abad' 'abaddon' 'abandon'
 'abandoned' 'abandoning']
------------
['abolish' 'abolished' 'abolishing' 'abolition' 'abolitionist'
 'abolitionists' 'abomb' 'abominable' 'abomination' 'abominations'
 'aboriginal' 'aboriginals' 'aborigine' 'aborigines' 'abort' 'aborted'
 'abortion' 'abortions' 'abot' 'abotu' 'abou' 'aboumekhael' 'abound'
 'abounds' 'abour' 'about' 'aboutcom' 'abouth' 'abouti' 'above']
------------
['agreement' 'agreements' 'agrees' 'agress' 'agressing' 'agression'
 'agressive' 'agressively' 'agressor' 'agricultural' 'agriculture'
 'agriculturists' 'agrizoophobia' 'aground' 'ags' 'aguilera' 'aguri' 'agw'
 'ah' 'aha' 'ahaha' 'ahahahahaha' 'aharon' 'ahd' 'ahead' 'ahem' 'ahh'
 'ahhh' 'ahhhh' 'ahhrelief']
------------
['copright' 'coproduced' 'coproducer' 'cops' 'coptic' 'copts' 'copula'
 'copulat

## Train & Test Preparation 

In [None]:
# The target is the human-made classification: one column per toxicity category
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
target = train_data[label_columns]
# target = np.array(target) #transform dataframe into array
target.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


Let's check whether the target values are balanced.   
In other words, does the target contain as many toxic as non-toxic comments?

In [None]:
# Share of training comments flagged in each category
positives_per_label = target.sum(axis=0)
positives_per_label / len(target)

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

As we can see, the target set is not balanced: every category is positive in fewer than 10% of the comments.

In [None]:
# Same share, computed on the labelled test set for comparison with the training set
keys = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
test_positives_per_label = test[keys].sum(axis=0)
test_positives_per_label / len(test)

toxic            0.095189
severe_toxic     0.005736
obscene          0.057692
threat           0.003298
insult           0.053565
identity_hate    0.011129
dtype: float64

In [None]:
# We create the train and hold-out sets using train_test_split:
# 20% of the training bag-of-words matrix is set aside, with a fixed seed for reproducibility.
# Note: test_x / test_y come from the training data and are distinct from bow_test,
# which is built from the separate labelled test set.
train_x, test_x, train_y, test_y = train_test_split(bow,target, test_size=0.20 ,random_state=0)