In [None]:
!pip install -U nltk

In [None]:
import pandas as pd
import re
import json
from nltk.corpus import stopwords
import nltk
import numpy as np
nltk.download('stopwords')

from nltk.util import pad_sequence
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

stop_words = stopwords.words('english')

In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# The CSV splits are read from the notebook's working directory.
# An earlier assignment pointing at '../../data/<name>.csv' was immediately
# overwritten by this one and has been removed; switch the prefix back if the
# files live there instead.
load_paths = ['./train_data.csv', './valid_data.csv', './test_data.csv']

# Order of load_paths is train, valid, test.
train_data, valid_data, test_data = (pd.read_csv(path) for path in load_paths)

In [None]:
def filter_unwanted_rows(data):
    """Drop every row whose 'Label' is greater than 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least a 'Source' and a 'Label' column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sources, labels)`` for the rows that were kept, in original order.

    Notes
    -----
    Rows are selected with a boolean mask, so the result does not depend on
    the frame's index. (Dropping by position-as-label only works when the
    index happens to be the default 0..n-1 range.)
    """
    # Negating `> 3` (instead of testing `<= 3`) keeps rows whose label is
    # NaN, because NaN > 3 is False.
    keep = ~(data['Label'] > 3)
    kept = data.loc[keep]
    return np.asarray(kept['Source']), np.asarray(kept['Label'])

In [None]:
X_train, y_train = filter_unwanted_rows(train_data)
X_test, y_test = filter_unwanted_rows(test_data)
X_valid, y_valid = filter_unwanted_rows(valid_data)

# Longest source text (in characters) that is kept in any split.
MAX_SOURCE_CHARS = 550


def _drop_long_samples(X, y, max_chars=MAX_SOURCE_CHARS):
    """Remove samples whose text is longer than ``max_chars`` characters.

    The keep/drop decision is computed once and applied to both ``X`` and
    ``y``, so the two arrays cannot get out of sync.

    Parameters
    ----------
    X : sequence of str
        Source texts.
    y : sequence
        Labels aligned with ``X``.
    max_chars : int, optional
        Maximum allowed ``len(text)``; defaults to ``MAX_SOURCE_CHARS``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_kept, y_kept)`` with the over-long samples removed.
    """
    keep = [len(x) <= max_chars for x in X]
    X_kept = np.asarray([x for x, k in zip(X, keep) if k])
    y_kept = np.asarray([label for label, k in zip(y, keep) if k])
    return X_kept, y_kept


X_train, y_train = _drop_long_samples(X_train, y_train)
X_test, y_test = _drop_long_samples(X_test, y_test)
X_valid, y_valid = _drop_long_samples(X_valid, y_valid)

In [None]:
# Different techniques for tackling class imbalance
from imblearn.over_sampling import RandomOverSampler, SMOTE

def balance_data(x, y, _type=0):
    """Optionally oversample ``(x, y)`` to counter class imbalance.

    Parameters
    ----------
    x : array-like
        Feature matrix, passed unchanged to the sampler's ``fit_resample``.
    y : array-like
        Labels aligned with ``x``.
    _type : int, optional
        1 -> ``RandomOverSampler``; 2 -> ``SMOTE``; any other value
        (default 0) returns the inputs untouched.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)``, or ``(x, y)`` itself when no
        resampling is requested.
    """
    if _type == 1:
        ros = RandomOverSampler(random_state=42)
        return ros.fit_resample(x, y)
    elif _type == 2:
        # Seeded like the random oversampler so the synthetic samples (and
        # therefore the reported metrics) are reproducible across runs.
        smote = SMOTE(random_state=42)
        return smote.fit_resample(x, y)
    return x, y

In [None]:
def minor_clean_text(text):
    """Normalise a raw text into a space-separated string of content words.

    Steps, in order: lowercase; turn newlines into spaces; remove anything
    enclosed in ``<...>`` or ``[...]``; replace every character that is not
    ``a``-``z`` or a space with a space; remove words of one or two
    characters; drop words found in the module-level ``stop_words``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r"\n", " ", text)
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (e.g. "\<"), which Python warns about. Same matches as before.
    text = re.sub(r"[<\[].*?[>\]]", " ", text)
    text = re.sub(r"[^a-z ]", " ", text)
    text = re.sub(r"\b\w{1,2}\b", " ", text)
    return " ".join([x for x in text.split() if x not in stop_words])

In [None]:
from collections import Counter

# Show how many samples of each label every split contains.
for heading, split_labels in (
    ("Train labels ratio", y_train),
    ("Test labels ratio", y_test),
    ("Validation labels ratio", y_valid),
):
    print(heading)
    print(Counter(split_labels.tolist()))

Train labels ratio
Counter({3: 14012, 2: 1929, 1: 64})
Test labels ratio
Counter({3: 3015, 2: 427, 1: 12})
Validation labels ratio
Counter({3: 3000, 2: 438, 1: 9})


In [None]:
# Run every text of each split through minor_clean_text; results are plain lists.
X_train = list(map(minor_clean_text, X_train))
X_valid = list(map(minor_clean_text, X_valid))
X_test = list(map(minor_clean_text, X_test))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, precision_score, f1_score, recall_score, classification_report
import matplotlib.pyplot as plt
import json

# Human-readable heading for each balancing mode passed to get_metrics.
statements = {0: "Without oversampling", 1: "With random oversampling", 2: "With SMOTE"}

def get_metrics(b, ytrue, ypred, file_name, folder_name):
    """Print, plot and persist classification metrics for one prediction run.

    Prints the heading for balancing mode ``b`` and the text classification
    report, draws the confusion matrix, then writes
    ``<folder_name>/<file_name>_conf.png`` and
    ``<folder_name>/<file_name>_stats.json``.

    Parameters
    ----------
    b : int
        Key into ``statements`` (0, 1 or 2).
    ytrue, ypred : array-like
        True and predicted labels.
    file_name : str
        Prefix of the two output files.
    folder_name : str
        Directory the outputs are written to; created if it does not exist.

    Raises
    ------
    KeyError
        If ``b`` is not a key of ``statements``.
    """
    import os  # local import so this cell does not depend on the imports cell

    # Neither savefig nor open() creates missing directories; without this
    # the first run in a fresh workspace fails with FileNotFoundError.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    print('\n\n')
    print(statements[b] + '\n')
    print(classification_report(ytrue, ypred))
    # Second call returns the same report as a dict, for the JSON dump below.
    ret = classification_report(ytrue, ypred, output_dict=True)
    ConfusionMatrixDisplay.from_predictions(ytrue, ypred)
    # Save before show(): the figure is saved while it is still current.
    plt.savefig(f"{folder_name}/{file_name}_conf.png", dpi=300)
    plt.show()
    print('\n\n')
    with open(f"{folder_name}/{file_name}_stats.json", "w") as f:
        json.dump(ret, f, indent=4)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary (capped at 6000 features) on the training texts only.
vectorizer = TfidfVectorizer(max_features=6000)
vectorizer.fit(X_train)

# Replace each split's texts with its dense TF-IDF matrix.
X_train, X_valid, X_test = (
    vectorizer.transform(split).toarray() for split in (X_train, X_valid, X_test)
)

# The validation rows are appended to the test rows and evaluated together.
X_test = np.concatenate([X_test, X_valid], axis=0)
y_test = np.concatenate([y_test, y_valid])

print(X_test.shape, y_test.shape, sep="\n")

(6901, 6000)
(6901,)


In [None]:
# RF
from sklearn.ensemble import RandomForestClassifier

for t in range(3):
    # Resample into fresh names. Reassigning X_train/y_train here would carry
    # the oversampled data into the next iteration and into later cells, so
    # the "Without oversampling" / "With SMOTE" headings would no longer
    # describe the data the model was actually trained on.
    X_tr, y_tr = balance_data(X_train, y_train, t)

    print("Class distribution:", Counter(y_tr))
    rf_model = RandomForestClassifier(random_state=123).fit(X_tr, y_tr)
    print("\nFor training set\n")
    train_pred = rf_model.predict(X_tr)
    get_metrics(t, y_tr, train_pred, f"RF_train_{t}", "RF")
    print("\nFor test set\n")
    test_pred = rf_model.predict(X_test)
    get_metrics(t, y_test, test_pred, f"RF_test_{t}", "RF")
    print('-'*210)

In [None]:
# DT
from sklearn.tree import DecisionTreeClassifier

for t in range(3):
    # Resample into fresh names. Reassigning X_train/y_train here would carry
    # the oversampled data into the next iteration and into later cells, so
    # the "Without oversampling" / "With SMOTE" headings would no longer
    # describe the data the model was actually trained on.
    X_tr, y_tr = balance_data(X_train, y_train, t)

    print("Class distribution:", Counter(y_tr))
    dt_model = DecisionTreeClassifier(random_state=123).fit(X_tr, y_tr)
    print("\nFor training set\n")
    train_pred = dt_model.predict(X_tr)
    get_metrics(t, y_tr, train_pred, f"DT_train_{t}", "DT")
    print("\nFor test set\n")
    test_pred = dt_model.predict(X_test)
    get_metrics(t, y_test, test_pred, f"DT_test_{t}", "DT")
    print('-'*210)

In [None]:
# SVM
from sklearn import svm

for t in range(3):
    # The run without resampling uses the whole training set; the resampled
    # runs only balance the first 5500 rows.
    X_src, y_src = (X_train, y_train) if t == 0 else (X_train[:5500], y_train[:5500])
    X_tr, y_tr = balance_data(X_src, y_src, t)
    print("Class distribution:", Counter(y_tr))

    svm_model = svm.SVC()
    svm_model.fit(X_tr, y_tr)

    # Metrics on the (possibly resampled) data the model was fitted on.
    print("\nFor training set\n")
    train_pred = svm_model.predict(X_tr)
    get_metrics(t, y_tr, train_pred, f"SVM_train_{t}", "SVM")

    # Metrics on the held-out test data.
    print("\nFor test set\n")
    test_pred = svm_model.predict(X_test)
    get_metrics(t, y_test, test_pred, f"SVM_test_{t}", "SVM")

    print('-'*210)

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

for t in range(3):
    # Resample into fresh names. Reassigning X_train/y_train here would carry
    # the oversampled data into the next iteration and into later cells, so
    # the "Without oversampling" / "With SMOTE" headings would no longer
    # describe the data the model was actually trained on.
    X_tr, y_tr = balance_data(X_train, y_train, t)

    print("Class distribution:", Counter(y_tr))
    knn_model = KNeighborsClassifier(n_neighbors=203).fit(X_tr, y_tr)
    print("\nFor training set\n")
    train_pred = knn_model.predict(X_tr)
    get_metrics(t, y_tr, train_pred, f"KNN_train_{t}", "KNN")
    print("\nFor test set\n")
    test_pred = knn_model.predict(X_test)
    get_metrics(t, y_test, test_pred, f"KNN_test_{t}", "KNN")
    print('-'*210)

In [None]:
# !zip -r SVM.zip SVM

In [17]:
# from google.colab import files
# files.download("SVM.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>