In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used by this notebook; the download is skipped when
# the package is already up to date locally.
# 'punkt' is presumably required by word_tokenize (imported above).
# NOTE(review): 'stopwords' is not referenced in the visible cells — confirm it
# is still needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the training split. The CSV has no header row, so the two columns are
# named explicitly: 'class' (the label) and 'text' (the document).
# NOTE(review): csv.field_size_limit configures the stdlib csv module; confirm
# that the pandas parser used here actually depends on it.
csv.field_size_limit(999999)
train = pd.read_csv(
    'raw_data/fulltrain.csv',
    header=None,
    names=['class', 'text'],
)
X_train, y_train = train['text'], train['class']

In [4]:
# Load the evaluation split with the same header-less two-column layout:
# 'class' (the label) and 'text' (the document).
test = pd.read_csv(
    "raw_data/balancedtest.csv",
    header=None,
    names=['class', 'text'],
)
X_test, y_test = test['text'], test['class']

## TF-IDF vectorizer

In [5]:
def tfidf(X_train, X_test, ngram_range=(1, 1), max_features=10000):
    """Turn raw training and test documents into TF-IDF feature matrices.

    The vectorizer is fitted on the training documents only and then applied
    unchanged to the test documents, so the test split never influences the
    learned vocabulary or weights.

    Args:
        X_train: Training documents passed to ``fit_transform``.
        X_test: Test documents passed to ``transform``.
        ngram_range: ``(min_n, max_n)`` forwarded to ``TfidfVectorizer``.
            Defaults to ``(1, 1)``, the value previously hard-coded here.
        max_features: Forwarded to ``TfidfVectorizer``. Defaults to ``10000``,
            the value previously hard-coded here.

    Returns:
        tuple: ``(X_train_tfidf, X_test_tfidf)`` as returned by the
        vectorizer's ``fit_transform`` and ``transform`` respectively.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

    # Fit on train only; the test split is transformed with the same fitted state.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    return X_train_tfidf, X_test_tfidf

# Build the TF-IDF matrices for both splits; the classifier cell below consumes them.
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

In [6]:
def RF(X_train, y_train, X_test, y_test, n_estimators=100, random_state=42):
    """Fit a random forest, print its classification report, and return predictions.

    Args:
        X_train: Training feature matrix passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Test feature matrix passed to ``predict``.
        y_test: True test labels, used only for the printed report.
        n_estimators: Forwarded to ``RandomForestClassifier``. Defaults to
            ``100``, the value previously hard-coded here.
        random_state: Forwarded to ``RandomForestClassifier``. Defaults to
            ``42``, the value previously hard-coded here.

    Returns:
        The predictions returned by ``clf.predict(X_test)``.

    Side effects:
        Prints ``classification_report(y_test, y_pred)`` to stdout.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    return y_pred

In [7]:
# Random forest on the TF-IDF features. The report is printed by RF, and the
# returned predictions are echoed as the cell's output.
RF(X_train_tfidf, y_train, X_test_tfidf, y_test)

              precision    recall  f1-score   support

           1       0.66      0.80      0.72       750
           2       0.57      0.17      0.26       750
           3       0.52      0.81      0.63       750
           4       0.79      0.74      0.76       750

    accuracy                           0.63      3000
   macro avg       0.63      0.63      0.59      3000
weighted avg       0.63      0.63      0.59      3000



array([1, 1, 1, ..., 1, 4, 4], dtype=int64)

## Word Embedding

In [8]:
def load_glove_model(glove_file_path):
    """Load text-format word embeddings into a ``{word: vector}`` dictionary.

    Each line is split on whitespace; the first field is the word and the
    remaining fields are parsed as ``float32`` vector components. Blank lines
    and lines whose components cannot be parsed as floats are skipped, so a
    malformed line never ends up mapped to another word's vector.

    Args:
        glove_file_path: Path to a UTF-8 encoded embeddings text file.

    Returns:
        dict: Maps each word (``str``) to a 1-D ``float32`` numpy array.
    """
    glove_model = {}
    with open(glove_file_path, 'r', encoding="utf8") as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line: indexing split_line[0] would raise IndexError.
                continue
            word = split_line[0]
            try:
                coefs = np.asarray(split_line[1:], dtype='float32')
            except ValueError:
                # Unparsable components: skip the line. Falling through would
                # store the previous line's vector under this word (or raise
                # NameError if this is the first line).
                continue
            glove_model[word] = coefs
    return glove_model

# Read the GloVe embeddings (300-d per the file name) into a word -> vector dict.
glove_model = load_glove_model("glove.6B/glove.6B.300d.txt")

In [9]:
def preprocess_text(text):
    """Lower-case ``text`` and tokenize it with ``word_tokenize``.

    Args:
        text: The document to tokenize; must provide ``.lower()``.

    Returns:
        The result of ``word_tokenize`` applied to the lower-cased text.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return tokens

# Tokenize every document in both splits (lower-cased, via preprocess_text).
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)

In [10]:
def document_vector(doc, model):
    """Represent a tokenized document as the mean of its known word vectors.

    Args:
        doc: Iterable of tokens.
        model: Mapping from token to embedding vector (numpy array).

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of all tokens found in
        ``model``. If no token is found, a zero vector shaped like the first
        vector stored in ``model``.
    """
    known_vectors = []
    for token in doc:
        # Tokens without an embedding contribute nothing to the average.
        if token in model:
            known_vectors.append(model[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # Nothing matched: borrow the shape from any stored vector.
    reference = next(iter(model.values()))
    return np.zeros(reference.shape)

# Embed each tokenized document as a single averaged GloVe vector and collect
# the per-document vectors into one array per split.
X_train_vectors = np.array([document_vector(doc, glove_model) for doc in X_train_processed])
X_test_vectors = np.array([document_vector(doc, glove_model) for doc in X_test_processed])

In [11]:
# Same random-forest evaluation, this time on the averaged GloVe document vectors.
RF(X_train_vectors, y_train, X_test_vectors, y_test)

              precision    recall  f1-score   support

           1       0.66      0.58      0.62       750
           2       0.58      0.41      0.48       750
           3       0.57      0.88      0.69       750
           4       0.85      0.72      0.78       750

    accuracy                           0.65      3000
   macro avg       0.66      0.65      0.64      3000
weighted avg       0.66      0.65      0.64      3000



array([1, 1, 1, ..., 4, 4, 4], dtype=int64)