In [1]:
import pandas as pd

# Read the training CSV from the local data folder into a pandas DataFrame
raw_training_data = pd.read_csv(filepath_or_buffer='data/train.csv')

In [2]:
# Preview the top of the DataFrame (first five rows)
raw_training_data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Preview the bottom of the DataFrame (last five rows)
raw_training_data.tail(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\r\nAnd ... I really don't think you understa...",0,0,0,0,0,0


In [4]:
# Show the distinct values present in each classification (label) column.
# A single loop replaces six copy-pasted print statements, so the printed
# label can no longer drift from the column that is actually inspected.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for column in label_columns:
    print(f"Unique values in the column \"{column}\": {raw_training_data[column].unique()}")

Unique values in the row "toxic": [0 1]
Unique values in the row "severe_toxic": [0 1]
Unique values in the row "obscene": [0 1]
Unique values in the row "threat": [0 1]
Unique values in the row "insult": [0 1]
Unique values in the row "indentity_hate": [0 1]


In [5]:
import nltk

# Ensure necessary NLTK resources are downloaded.
# The return value of the last call (True in the recorded run) is what the
# cell displays as its output, so the order of these calls matters for display.
nltk.download('punkt_tab')  # presumably the tokenizer data behind word_tokenize -- TODO confirm
nltk.download('stopwords')  # stop-word corpus, read later via stopwords.words('english')
nltk.download('wordnet')  # presumably the WordNet data used by WordNetLemmatizer -- TODO confirm

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# The stop-word set and the lemmatizer are built on first use and then reused:
# preprocess_text is applied once per comment, and rebuilding both objects on
# every call is loop-invariant work.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first call."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing anything, so a failure (e.g. a
        # missing NLTK corpus) leaves the cache empty and is retried next call.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Preprocess one raw comment into a cleaned, lemmatized string.

    Steps, in order: lowercase; strip ``@``-mentions, URLs and the ``&quot``
    HTML-entity residue; replace every character that is neither a word
    character nor whitespace with a space; delete digits; tokenize; drop
    English stop words; lemmatize; re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        The raw comment. A missing value (``None`` or a float NaN) is treated
        as an empty comment instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; ``''`` when
        nothing is left or the input was missing.
    """
    # Missing comments yield an empty result. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert text to lowercase
    text = text.lower()
    # Remove usernames: an '@', optionally followed by one space, then word characters
    text = re.sub(r'@ ?\w+', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove "&quot", the HTML entity for a double quotation mark; a trailing
    # ';' is left behind and is turned into a space by the next substitution
    text = re.sub(r'&quot', '', text)
    # Replace every character that is not a word character or whitespace with a space
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove all digits
    text = re.sub(r'\d', '', text)

    stop_words, lemmatizer = _preprocess_resources()

    # Tokenize, drop stop words, then lemmatize what remains
    tokens = word_tokenize(text)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    ]
    # Join the tokens back into a single string
    return ' '.join(lemmatized_tokens)

In [7]:
# Run preprocess_text over every comment, element by element.
# Note: the result is written back into the same 'comment_text' column, so the
# original raw text is no longer held in raw_training_data after this cell.
cleaned_comments = raw_training_data['comment_text'].map(preprocess_text)
raw_training_data['comment_text'] = cleaned_comments

In [8]:
# Inspect the preprocessed frame. A bare last expression uses the notebook's
# rich table display, whereas print() falls back to wrapped plain text that
# splits the columns across several blocks.
raw_training_data

                      id                                       comment_text  \
0       0000997932d777bf  explanation edits made username hardcore metal...   
1       000103f0d9cfb60f  aww match background colour seemingly stuck th...   
2       000113f07ec002fd  hey man really trying edit war guy constantly ...   
3       0001b41b1c6bb37e  make real suggestion improvement wondered sect...   
4       0001d958c54c6e35                      sir hero chance remember page   
...                  ...                                                ...   
159566  ffe987279560d7ff  second time asking view completely contradicts...   
159567  ffea4adeee384e90               ashamed horrible thing put talk page   
159568  ffee36eab5c267c9  spitzer umm there actual article prostitution ...   
159569  fff125370e4aaaf3  look like actually put speedy first version de...   
159570  fff46fc426af1f9a  really think understand came idea bad right aw...   

        toxic  severe_toxic  obscene  threat  insul