In [23]:
%%capture
%pip install -r requirements.txt
%pip install spacy
%pip install gensim
!python -m spacy download en_core_web_sm

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec


In [3]:
# Reading data
def read_csv(file_path):
    """_summary_

    Args:
        file_path (_type_): _description_
    """
    text_data = pd.read_csv(file_path, sep=",")
    return text_data

train_path = "./data/train/train.csv"
test_path = "./data/test/test.csv"

In [4]:
train_data = read_csv(train_path)
test_data = read_csv(test_path)

In [5]:
train_data.head(3)

Unnamed: 0,ID,TEXT,LABEL
0,7850790573542594519,If you love good films don't ever buy this pei...,2
1,9392069522632994700,The 33 percent of the nations nitwits that sti...,2
2,5083704536542443514,I saw Anatomy years ago -- dubbed at a friends...,1


In [6]:
# Getting info from dataset
def explore_data(dataset):
    """_summary_

    Args:
        dataset (_type_): _description_
    """
    # dataset column value counts
    print(dataset['LABEL'].value_counts())
    
    # dataset null values
    print(train_data.isnull().sum())
    
    # dataset duplicate data info
    print(train_data.duplicated().sum())
    
    # dataset shape
    print(train_data.shape)
    
    # dataset info
    print(train_data.info())

In [7]:
explore_data(train_data)

LABEL
0    32071
1    19276
2    18970
Name: count, dtype: int64
ID       0
TEXT     6
LABEL    0
dtype: int64
0
(70317, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70317 entries, 0 to 70316
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      70317 non-null  uint64
 1   TEXT    70311 non-null  object
 2   LABEL   70317 non-null  int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 1.6+ MB
None


#### **Text Pre-processing**

In [18]:
nlp = spacy.load("en_core_web_sm")

def process_text(text):
    doc = nlp(text)
    
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    processed_text = " ".join(processed_tokens)
    
    return processed_text

train_data['TEXT'] = train_data['TEXT'].astype('str').apply(process_text)

In [19]:
def text_preprocess(txt: str):
    """_summary_

    Args:
        txt (_type_): _description_

    Returns:
        _type_: _description_
    """
    txt = txt.lower()
    
    txt = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))', '<URL>', txt)

    txt = re.sub(r'\d+', '<PHONE>', txt)
    
    txt = re.sub(r'[^a-zA-Z\s]', '', txt)
    
    txt = re.sub(r'[^\w\s]', '<PUNCT>', txt)

    txt = re.sub(r'\b\w\b', '<SNGL>', txt)

    txt = re.sub(r'\s+', '<SPC>', txt).strip()
    
    return txt

In [20]:
train_data['TEXT'] = train_data['TEXT'].astype('str').apply(text_preprocess)

#### **Text to features**

In [25]:
X_train, X_test, y_train, y_test = train_test_split(train_data['TEXT'], train_data['LABEL'], test_size=0.2, random_state=42)

word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

def wordlist_to_vector(word_list):
    feature_vector = np.zeros((word2vec_model.vector_size,), dtype="float32")
    num_words = 0
    for word in word_list:
        if word in word2vec_model.wv:
            feature_vector = np.add(feature_vector, word2vec_model.wv[word])
            num_words += 1
    if num_words != 0:
        feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [26]:
X_train_vectors = [wordlist_to_vector(review) for review in X_train]
X_test_vectors = [wordlist_to_vector(review) for review in X_test]

In [27]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vectors, y_train)

y_pred = classifier.predict(X_test_vectors)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.83      0.71      6454
           1       0.48      0.35      0.40      3856
           2       0.49      0.36      0.41      3754

    accuracy                           0.57     14064
   macro avg       0.53      0.51      0.51     14064
weighted avg       0.55      0.57      0.55     14064



In [29]:
test_data['TEXT'] = test_data['TEXT'].astype('str').apply(process_text)
test_data['TEXT'] = test_data['TEXT'].apply(text_preprocess)

X_test_final = [wordlist_to_vector(review) for review in test_data['TEXT']]

y_pred_final = classifier.predict(X_test_final)

In [31]:
og_test_data = read_csv(test_path)

submission_df = pd.DataFrame({'ID': og_test_data['ID'], 'LABEL': y_pred_final})

submission_df.to_csv('./data/submission/submission.csv', index=False)