In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
training_data = pd.read_csv('data/train.csv')

In [2]:
# Showing the first five elements of the DataFrame
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Showing the last five elements of the DataFrame
training_data.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\r\nAnd ... I really don't think you understa...",0,0,0,0,0,0


In [4]:
# Showing all unique values in the classification rows
print(f"Unique values in the row \"toxic\": {training_data['toxic'].unique()}")
print(f"Unique values in the row \"severe_toxic\": {training_data['severe_toxic'].unique()}")
print(f"Unique values in the row \"obscene\": {training_data['obscene'].unique()}")
print(f"Unique values in the row \"threat\": {training_data['threat'].unique()}")
print(f"Unique values in the row \"insult\": {training_data['insult'].unique()}")
print(f"Unique values in the row \"indentity_hate\": {training_data['identity_hate'].unique()}")

Unique values in the row "toxic": [0 1]
Unique values in the row "severe_toxic": [0 1]
Unique values in the row "obscene": [0 1]
Unique values in the row "threat": [0 1]
Unique values in the row "insult": [0 1]
Unique values in the row "indentity_hate": [0 1]


In [5]:
import nltk

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

def preprocess_text(text):
    "Proprocessing the comments of the raw data"

    # Convert text to lowercase
    text = text.lower()
    # removing usernames starting with '@'
    text = re.sub(r'@ ?\w+', '', text)
    # removing URL's
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # removing the substring "quot", which is an HTML entity for double quotation ("")
    text = re.sub(r'&quot', '', text)
    # removing all special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # removing all digits
    text = re.sub(r'\d', '', text)

    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [7]:
# applies the preprocessing function to all comments 
training_data['comment_text'] = training_data['comment_text'].apply(preprocess_text)

In [8]:
print(training_data)

                      id                                       comment_text  \
0       0000997932d777bf  explanation edits made username hardcore metal...   
1       000103f0d9cfb60f  aww match background colour seemingly stuck th...   
2       000113f07ec002fd  hey man really trying edit war guy constantly ...   
3       0001b41b1c6bb37e  make real suggestion improvement wondered sect...   
4       0001d958c54c6e35                      sir hero chance remember page   
...                  ...                                                ...   
159566  ffe987279560d7ff  second time asking view completely contradicts...   
159567  ffea4adeee384e90               ashamed horrible thing put talk page   
159568  ffee36eab5c267c9  spitzer umm there actual article prostitution ...   
159569  fff125370e4aaaf3  look like actually put speedy first version de...   
159570  fff46fc426af1f9a  really think understand came idea bad right aw...   

        toxic  severe_toxic  obscene  threat  insul

In [9]:
!pip install gensim




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
!pip install scikit-learn pandas




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


def create_tfidf_model(data : pd.core.frame.DataFrame)->None:
    "Creates the tf-idf model and saves it"
    vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.8)
    tfidf_matrix = vectorizer.fit_transform(data["comment_text"].to_list())

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    with open('TF-IDF/tfidf_vectorizer.pkl', 'wb') as file:
        pickle.dump(vectorizer, file)



In [12]:
from gensim.models import Word2Vec

def train_word2vec_model(training_data : pd.core.frame.DataFrame)->None:
    "Trains the word2vec model and saves it"
    # converting datafram column into a list of strings
    tokenized_sentences = []
    for sentence in training_data["comment_text"].to_list():
        tokenized_sentences.append(sentence.split())

    # train word2vec model
    word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

    # save model
    word2vec_model.save("word2vec_model.model")


In [13]:
word2vec_model = Word2Vec.load("word2vec/word2vec_model.model")

In [14]:
def apply_word2vec(sentence : str)->list:
    "uses word2vec to vectorize a sentence"
    word_vectors = [word2vec_model.wv[word] for word in sentence.split() if word in word2vec_model.wv]
    return word_vectors
    

In [22]:
# APPLY WORD2VEC
training_data['comment_text'] = training_data['comment_text'].apply(apply_word2vec)
print(training_data)

                      id                                       comment_text  \
0       0000997932d777bf  explanation edits made username hardcore metal...   
1       000103f0d9cfb60f  aww match background colour seemingly stuck th...   
2       000113f07ec002fd  hey man really trying edit war guy constantly ...   
3       0001b41b1c6bb37e  make real suggestion improvement wondered sect...   
4       0001d958c54c6e35                      sir hero chance remember page   
...                  ...                                                ...   
159566  ffe987279560d7ff  second time asking view completely contradicts...   
159567  ffea4adeee384e90               ashamed horrible thing put talk page   
159568  ffee36eab5c267c9  spitzer umm there actual article prostitution ...   
159569  fff125370e4aaaf3  look like actually put speedy first version de...   
159570  fff46fc426af1f9a  really think understand came idea bad right aw...   

        toxic  severe_toxic  obscene  threat  insul

In [28]:
create_tfidf_model(training_data)

         __  _noticeboard   aa  aaron   ab  abandon  abandoned  abbey  \
0       0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
1       0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
2       0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
3       0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
4       0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
...     ...           ...  ...    ...  ...      ...        ...    ...   
159566  0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
159567  0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
159568  0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
159569  0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   
159570  0.0           0.0  0.0    0.0  0.0      0.0        0.0    0.0   

        abbreviation  abc  ...  zero  zeus  zinc  zionism  zionist  zoe  \
0                0.0  0.0  ...   0.0   0.0   0.0

In [29]:
# APPLY TF-IDF
with open('TF-IDF/tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer_pickle = pickle.load(file)
new_X = loaded_vectorizer_pickle.transform(training_data["comment_text"].to_list())

In [30]:
print(new_X)

  (0, 1607)	0.30405713848074467
  (0, 2879)	0.14531612893393472
  (0, 3241)	0.19893795086719904
  (0, 3287)	0.2654554271852859
  (0, 3336)	0.2141972155976803
  (0, 3722)	0.2698575976776164
  (0, 4020)	0.3005751191711747
  (0, 5331)	0.14208892676056953
  (0, 5576)	0.3450061195119402
  (0, 5946)	0.14286435579357948
  (0, 6341)	0.09365630563113972
  (0, 6648)	0.10625846190140026
  (0, 7412)	0.16222106807714726
  (0, 7544)	0.2727849817452694
  (0, 7571)	0.17595865365306546
  (0, 8153)	0.14528735663500886
  (0, 8795)	0.09926187397041963
  (0, 8865)	0.17886520760860894
  (0, 9454)	0.2093687767408963
  (0, 9498)	0.16306213262942273
  (0, 9633)	0.263835867216898
  (0, 9971)	0.23015813010457778
  (1, 745)	0.3370734626981767
  (1, 1686)	0.42481028742626853
  (1, 4775)	0.33668267906662697
  :	:
  (159568, 7637)	0.38593865220713075
  (159568, 8941)	0.4144124651110586
  (159568, 9271)	0.43921309775884115
  (159569, 109)	0.3169511991333879
  (159569, 2359)	0.30405550241427015
  (159569, 3469)	0.2801