In [2]:
%%capture
%pip install -r requirements.txt
%pip install nltk

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC, LinearSVC, LinearSVR, SVR
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [4]:
# Reading data
def read_csv(file_path):
    """_summary_

    Args:
        file_path (_type_): _description_
    """
    text_data = pd.read_csv(file_path, sep=",")
    return text_data

train_path = "./data/train/train.csv"
test_path = "./data/test/test.csv"

In [5]:
train_data = read_csv(train_path)
test_data = read_csv(test_path)

In [6]:
train_data.head(3)

Unnamed: 0,ID,TEXT,LABEL
0,7850790573542594519,If you love good films don't ever buy this pei...,2
1,9392069522632994700,The 33 percent of the nations nitwits that sti...,2
2,5083704536542443514,I saw Anatomy years ago -- dubbed at a friends...,1


In [7]:
# Getting info from dataset
def explore_data(dataset):
    """_summary_

    Args:
        dataset (_type_): _description_
    """
    # dataset column value counts
    print(dataset['LABEL'].value_counts())
    
    # dataset null values
    print(train_data.isnull().sum())
    
    # dataset duplicate data info
    print(train_data.duplicated().sum())
    
    # dataset shape
    print(train_data.shape)
    
    # dataset info
    print(train_data.info())
    
    # null values
    print(f"Number of NULL rows: {dataset.isna().sum()}")

In [8]:
explore_data(train_data)

LABEL
0    32071
1    19276
2    18970
Name: count, dtype: int64
ID       0
TEXT     6
LABEL    0
dtype: int64
0
(70317, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70317 entries, 0 to 70316
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      70317 non-null  uint64
 1   TEXT    70311 non-null  object
 2   LABEL   70317 non-null  int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 1.6+ MB
None
Number of NULL rows: ID       0
TEXT     6
LABEL    0
dtype: int64


#### **Text Pre-processing**

In [9]:
def transform_column(dataset):
  dataset['TEXT'] = dataset['TEXT'].fillna('EMPTY REVIEW')
  dataset['TEXT'] = dataset['TEXT'].astype(str)
  return dataset

In [10]:
train_data = transform_column(train_data)
test_data = transform_column(test_data)

In [11]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\ual-
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\ual-
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
lemmatizer = WordNetLemmatizer()

def process_text(text):
    """Process the input text by tokenizing, lemmatizing, and removing stopwords.

    Args:
        text (str): Input text to be processed.

    Returns:
        str: Processed text.
    """
    tokens = word_tokenize(text.lower())
    processed_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    processed_text = " ".join(processed_tokens)
    return processed_text

train_data['TEXT'] = train_data['TEXT'].astype('str').apply(process_text)

#### **Text to features**

In [30]:
X_train = train_data['TEXT']
y_train = train_data['LABEL']
X_test  = test_data['TEXT']

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

X_test_tfidf = tfidf_vectorizer.transform(X_test)

classifier_svm = LinearSVC(class_weight="balanced", 
                           max_iter=2000, 
                           intercept_scaling=0.75,
                           verbose=2)
classifier_svm.fit(X_train_tfidf, y_train)



[LibLinear]

In [31]:
# training data accuracy
y_train_pred = classifier_svm.predict(X_train_tfidf)
train_acc = accuracy_score(y_train, y_train_pred)

print("Accuracy:", train_acc)
print(classification_report(y_train, y_train_pred))

Accuracy: 0.9997582376950097
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     32071
           1       1.00      1.00      1.00     19276
           2       1.00      1.00      1.00     18970

    accuracy                           1.00     70317
   macro avg       1.00      1.00      1.00     70317
weighted avg       1.00      1.00      1.00     70317



In [32]:
y_pred_final = classifier_svm.predict(X_test_tfidf)

submission_df = pd.DataFrame({'ID': test_data['ID'], 'LABEL': y_pred_final})
submission_df.to_csv('./data/submission/submission.csv', index=False)