In [6]:
import json
import sys
from contextlib import redirect_stdout

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ParameterSampler


In [3]:
### Data preparation and preprocessing

# Load the training set from a JSON Lines file (one JSON record per line).
# The encoding is pinned to UTF-8 so the text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open('data_json/SubtaskA/subtaskA_train_monolingual.jsonl', 'r',
          encoding='utf-8') as f:
    df = pd.read_json(f, lines=True, orient='records')

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)


In [4]:
# Build TF-IDF features, dropping NLTK's English stop words.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(stop_words=english_stop_words)

# The vectorizer is fitted on the training texts only and then reused, as
# fitted, to transform the test texts.
# Both names are rebound from raw text to feature matrices here, so re-running
# this cell on its own would hand matrices (not text) to the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [8]:
# Train the baseline SGDClassifier: hinge loss with an L2 penalty.
# tol=None turns off the tolerance-based stopping criterion, so training runs
# for the full max_iter=10 epochs.
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                    random_state=42, verbose=1, max_iter=10, tol=None)

# verbose=1 makes fit() print its training progress; capture that in a file.
# redirect_stdout puts back whatever stream was active before the block, even
# if fit() raises. Assigning sys.__stdout__ by hand would instead point
# sys.stdout at the interpreter's original stream, which inside a notebook
# kernel is not the stream that renders cell output, so later print() calls
# would stop appearing in the notebook.
with open("SGD_outputs/random_training.csv", "w") as f, redirect_stdout(f):
    clf.fit(X_train, y_train)

In [9]:
# Vocabulary terms reported by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()

# Get the coefficients of the trained model.
# Only the first row of coef_ is used — assumes a binary task where coef_ has
# a single row; TODO confirm against the label set.
coef = clf.coef_[0]

# Create a dictionary of feature names and coefficients
features_coef = dict(zip(feature_names, coef))

# Sort the (word, weight) pairs by weight, ascending: the most negative
# weights come first and the most positive ones last.
sorted_features = sorted(features_coef.items(), key=lambda x: x[1])

# Write the 30 largest weights followed by the 30 smallest weights to one
# file. The encoding is explicit so that non-ASCII tokens are written
# regardless of the platform's default encoding, and so this file matches the
# UTF-8 default that DataFrame.to_csv uses for weights.csv below.
with open("SGD_outputs/top_bottom_words.csv", "w", encoding="utf-8") as f:
    for word, weight in sorted_features[-30:] + sorted_features[:30]:
        f.write(f"{word},{weight}\n")

# Store all of the weights in a separate csv file
weights_df = pd.DataFrame(sorted_features, columns=['word', 'weight'])
weights_df.to_csv('SGD_outputs/weights.csv', index=False)


In [10]:
# Learning-curve settings: ten evenly spaced training-set fractions from 10%
# to 100%, each evaluated with 5-fold cross-validation.
cv = 5
train_sizes = np.linspace(0.1, 1.0, 10)

# Run the learning curve. train_sizes is rebound to the sizes returned by
# learning_curve; the score arrays hold one row per size.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=clf, X=X_train, y=y_train, train_sizes=train_sizes, cv=cv)

# Collapse each score array along axis 1 into a mean and a standard deviation
# per training-set size.
learning_curve_columns = {
    'train_sizes': train_sizes,
    'train_scores_mean': train_scores.mean(axis=1),
    'test_scores_mean': test_scores.mean(axis=1),
    'train_scores_std': train_scores.std(axis=1),
    'test_scores_std': test_scores.std(axis=1),
}
df_learning_curve = pd.DataFrame(learning_curve_columns)

# Persist the curve coordinates for plotting elsewhere
df_learning_curve.to_csv('SGD_outputs/learning_curve.csv', index=False)


In [11]:
# Predict labels for the held-out test set with the baseline classifier
y_pred = clf.predict(X_test)

# Cross-tabulate the true test labels against the predicted ones
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(cm)


In [12]:
# Random search over the constant step size (eta0) and the L2 regularisation
# strength (alpha). Both are sampled from the same 10 log-spaced candidates
# between 1e-4 and 3.
log_spaced_grid = np.exp(np.linspace(np.log(0.0001), np.log(3), 10))
parameter_distribution = {'learning_rate': log_spaced_grid,
                          'reguliser_dampening': log_spaced_grid}

# Carve a validation split out of the training data. Candidates are compared
# on data they were NOT fitted on: ranking them by training accuracy would
# simply favour whichever setting memorises the training set best. The test
# set stays untouched so the final test score remains an honest estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# Placeholder to make future comparisons easier; once set it holds a
# (hyperparameters, validation_accuracy) tuple.
best_hyperparameters = None
print("Learning rate:\tReg.dampening:\tValidation set accuracy:")

# Fixed seeds make both the sampled candidates and each fit reproducible.
for hyperparameters in ParameterSampler(parameter_distribution, n_iter=10,
                                        random_state=42):
    # Set up the classifier
    reguliser_dampening = hyperparameters['reguliser_dampening']
    learning_rate = hyperparameters['learning_rate']
    model = SGDClassifier(loss='hinge', penalty='l2',
                          alpha=reguliser_dampening, verbose=0,
                          learning_rate='constant', eta0=learning_rate,
                          random_state=42)

    # Train the classifier on the fitting portion only
    model.fit(X_fit, y_fit)

    # Fraction of validation rows predicted correctly
    validation_accuracy = np.mean(model.predict(X_val) == y_val)

    # Store the hyperparameters if they are better than what we have found before
    if best_hyperparameters is None or best_hyperparameters[1] < validation_accuracy:
        best_hyperparameters = (hyperparameters, validation_accuracy)
    print("%.5f\t\t%.5f\t\t%.1f%%" % (
        hyperparameters['learning_rate'], hyperparameters['reguliser_dampening'], 100*validation_accuracy))

best_learning_rate = best_hyperparameters[0]['learning_rate']
best_reguliser_dampening = best_hyperparameters[0]['reguliser_dampening']
print("Best parameters: %.5f, %.5f" %
      (best_learning_rate, best_reguliser_dampening))


In [13]:
# Set up the classifier with the hyperparameters chosen by the search.
# random_state is fixed (matching the baseline clf) so the fitted model and the
# test accuracy reported below are reproducible from run to run.
model = SGDClassifier(loss='hinge', penalty='l2',
                      alpha=best_reguliser_dampening, verbose=1,
                      learning_rate='constant', eta0=best_learning_rate,
                      random_state=42)

# Train on all the non-test data
model.fit(X_train, y_train)

# Run prediction on the test set: fraction of test rows predicted correctly
test_accuracy = np.sum(model.predict(X_test) == y_test)/len(y_test)

print("Test set accuracy %.1f%%" % (100*test_accuracy))


In [14]:
# Load the dev set: a JSON Lines file, parsed one JSON object per line.
# The file is streamed rather than read into a list of lines first, blank
# lines (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads, and the encoding is explicit so decoding does not
# depend on the platform default.
with open("data_json/SubtaskA/subtaskA_dev_monolingual.jsonl", "r",
          encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert the list of JSON objects to a pandas DataFrame
new_df = pd.DataFrame(data)

# Reuse the vectorizer fitted on the training texts (transform only, no refit)
X_dev = vectorizer.transform(new_df['text'])

# Predict the labels for the new dataset.
# NOTE(review): this uses the baseline `clf`, not the tuned `model` — confirm
# that is the intended estimator for the submission file.
new_predictions = clf.predict(X_dev)

# predict() returns class labels, not probabilities. The >= 0.5 threshold is
# kept so the written values are exactly what they were before; it maps each
# predicted label to 0 or 1 (presumably a no-op for 0/1 labels — verify
# against the label set).
new_labels = (new_predictions >= 0.5).astype(int).tolist()

# Store the predictions in a separate jsonl file
predictions = list(zip(new_df['id'], new_labels))
predictions_df = pd.DataFrame(predictions, columns=['id', 'label'])
predictions_df.to_json('SGD_outputs/dev_predictions.jsonl',
                       lines=True, orient='records')


In [17]:
# Predict labels for the held-out test set.
# NOTE(review): this scores the baseline `clf`, not the tuned `model`, so the
# report describes the same predictions as the confusion-matrix cell — confirm
# which estimator was meant to be evaluated here.
y_pred2 = clf.predict(X_test)

# Build the classification report as text, then display it
report2 = classification_report(y_true=y_test, y_pred=y_pred2)
print(report2)
