### Initial Model

This model was created to predict sentiment labels for the rest of the dataset, which is still unlabelled.

Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [16]:
# Read the labelled comments into a DataFrame
labelled_data_path = 'labelled_data.csv'
labelled_data = pd.read_csv(labelled_data_path)

# Quick look at the data: first rows, numeric summary, then the class balance
for overview in (
    labelled_data.head(),
    labelled_data.describe(),
    labelled_data['sentiment'].value_counts(),
):
    print(overview)

   id                                            comment  score  sentiment
0  12  It's incredibly sickening when somebody with a...    2.0          1
1  20  If Gakpo isn't scoring or assisting does he of...    4.0          1
2  21  No, and he only ever turns up against average ...    1.0          1
3  22                                       Meady player    2.0          1
4  26  Plays like hes on the piss tbf. Guy sees open ...    1.0          1
                id        score    sentiment
count  1077.000000  1046.000000  1077.000000
mean    568.675023     5.979924     0.211699
std     388.233441    56.021720     0.408702
min       1.000000   -76.000000     0.000000
25%     271.000000     1.000000     0.000000
50%     540.000000     2.000000     0.000000
75%     809.000000     5.000000     0.000000
max    2465.000000  1788.000000     1.000000
sentiment
0    849
1    228
Name: count, dtype: int64


In [17]:
# Replace missing comments with empty strings so the TF-IDF vectorizer only ever
# receives text. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form emits
# a FutureWarning in pandas 2.x and no longer updates the DataFrame once
# Copy-on-Write is enabled.
labelled_data['comment'] = labelled_data['comment'].fillna('')

In [18]:
# Hold out 20% of the labelled comments for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    labelled_data['comment'],
    labelled_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Baseline model: TF-IDF features (unigrams and bigrams) feeding a Logistic Regression
pipeline = make_pipeline(
    TfidfVectorizer(min_df=5, norm='l2', ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000),
)

# Fit on the training split, then predict labels for the validation split
y_pred = pipeline.fit(X_train, y_train).predict(X_val)

In [19]:
# Score the baseline predictions against the validation labels
baseline_accuracy = accuracy_score(y_val, y_pred)
baseline_report = classification_report(y_val, y_pred)

print("Accuracy on the validation set:", baseline_accuracy)
print("\nClassification Report:\n", baseline_report)

Accuracy on the validation set: 0.8009259259259259

Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89       171
           1       1.00      0.04      0.09        45

    accuracy                           0.80       216
   macro avg       0.90      0.52      0.49       216
weighted avg       0.84      0.80      0.72       216



The initial baseline model scores very poorly on the minority class (1): recall is 0.04 and the f1-score is 0.09, so it predicts class 0 for almost every comment.

We will look at balancing the class weights so that the model pays more attention to the minority class.

In [21]:
# Compute 'balanced' class weights from the training labels
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

# Key each weight by its actual class label. compute_class_weight returns the
# weights in the same order as `classes`, so zipping the two keeps the mapping
# correct even if the labels are not exactly 0..n-1 (dict(enumerate(...)) would
# key the weights by position instead of by label).
class_weights_dict = dict(zip(classes.tolist(), class_weights))

# Update the pipeline using the computed class weights
pipeline_with_weights = make_pipeline(
    TfidfVectorizer(min_df=5, norm='l2', ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000, class_weight=class_weights_dict)
)

# Train the updated model
pipeline_with_weights.fit(X_train, y_train)

# Predict on the validation set and evaluate the model again
y_pred_with_weights = pipeline_with_weights.predict(X_val)
accuracy_with_weights = accuracy_score(y_val, y_pred_with_weights)
classification_report_with_weights = classification_report(y_val, y_pred_with_weights)

print(accuracy_with_weights)
print(classification_report_with_weights)

0.7638888888888888
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       171
           1       0.44      0.51      0.47        45

    accuracy                           0.76       216
   macro avg       0.65      0.67      0.66       216
weighted avg       0.78      0.76      0.77       216



We can see that balancing the class weights has given us a much better recall and f1-score on the minority class (recall 0.04 → 0.51, f1-score 0.09 → 0.47), with a trade-off in precision and a small drop in overall accuracy.

Let's look at creating an SVM model to see if that performs better

In [23]:
# Same TF-IDF features, but with a class-weighted linear SVM as the classifier.
# random_state is fixed (same seed as the train/validation split) because
# LinearSVC shuffles the data when it uses its dual solver; without a seed,
# repeated runs of this cell -- and of the grid search that clones this
# pipeline -- are not guaranteed to give identical results.
pipeline_with_svm = make_pipeline(
    TfidfVectorizer(min_df=5, norm='l2', ngram_range=(1, 2)),
    LinearSVC(class_weight='balanced', max_iter=10000, random_state=42)
)

# Train the updated model with SVM
pipeline_with_svm.fit(X_train, y_train)

# Predict on the validation set and evaluate the model again
y_pred_with_svm = pipeline_with_svm.predict(X_val)
accuracy_with_svm = accuracy_score(y_val, y_pred_with_svm)
classification_report_with_svm = classification_report(y_val, y_pred_with_svm)

print(accuracy_with_svm)
print(classification_report_with_svm)

0.7546296296296297
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       171
           1       0.42      0.47      0.44        45

    accuracy                           0.75       216
   macro avg       0.64      0.65      0.64       216
weighted avg       0.76      0.75      0.76       216





The SVM gives similar results to the class-weighted Logistic Regression model.

Next we will tune the hyperparameters of our SVM model with GridSearchCV. We will vary the regularisation strength and the tolerance, then print the best parameters found along with their scores.

In [24]:
# Candidate values for the SVM's regularisation strength (C) and stopping tolerance (tol)
param_grid = dict(
    linearsvc__C=[0.01, 0.1, 1, 10, 100],
    linearsvc__tol=[1e-4, 1e-3, 1e-2],
)

# 5-fold cross-validated search over the SVM pipeline, ranked by macro-averaged F1
grid_search = GridSearchCV(
    estimator=pipeline_with_svm,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=1,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Winning parameter combination and its cross-validated score
best_parameters, best_score = grid_search.best_params_, grid_search.best_score_

# Evaluate the selected model on the held-out validation split
y_pred_best = grid_search.predict(X_val)
accuracy_best = accuracy_score(y_val, y_pred_best)
classification_report_best = classification_report(y_val, y_pred_best)

for result in (best_parameters, best_score, accuracy_best, classification_report_best):
    print(result)


Fitting 5 folds for each of 15 candidates, totalling 75 fits








{'linearsvc__C': 1, 'linearsvc__tol': 0.0001}
0.6858727188727848
0.7546296296296297
              precision    recall  f1-score   support

           0       0.86      0.83      0.84       171
           1       0.42      0.47      0.44        45

    accuracy                           0.75       216
   macro avg       0.64      0.65      0.64       216
weighted avg       0.76      0.75      0.76       216





Now we will load the unlabeled dataset, use our best model to predict the sentiment of each comment, and export the predictions to a .csv file.

In [25]:
# Load the unlabeled dataset
unlabeled_data_path = 'all_data.csv'
unlabeled_data = pd.read_csv(unlabeled_data_path)

# Apply the same missing-value handling that was used on the training data:
# a NaN comment would make the TF-IDF step raise instead of producing a
# prediction. The filled copy is only used for scoring, so the 'comment'
# column written to disk is left exactly as it was loaded.
comments_to_score = unlabeled_data['comment'].fillna('')

# Predict with the best estimator found by the grid search
unlabeled_data['predicted_sentiment'] = grid_search.predict(comments_to_score)

# NOTE(review): this output name differs from the training input
# 'labelled_data.csv' by a single letter -- easy to mix up; consider a more
# distinct name such as 'predicted_sentiment.csv'.
labeled_data_path = 'labeled_data.csv'
unlabeled_data.to_csv(labeled_data_path, index=False)

print("Labeled data saved to:", labeled_data_path)


Labeled data saved to: labeled_data.csv
