In [2]:
import pandas as pd
import json
import pandas as pd
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import learning_curve, ParameterSampler, train_test_split
from nltk.corpus import stopwords

In [3]:
# Data preparation and preprocessing

# Load the dataset
with open('data_json/SubtaskA/subtaskA_train_monolingual.jsonl', 'r') as f:
    df = pd.read_json(f, lines=True, orient='records')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

In [4]:
# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [13]:
# Train the SGDClassifier model
clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                    random_state=42, verbose=0, max_iter=15, tol=None)

clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

# Predict the probabilities of the test set labels
y_prob = clf.predict_proba(X_test)




In [16]:
classification_report_df = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)).transpose()
classification_report_df.to_csv(
    'SGD_outputs/classification_report.csv', index=False)

confusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
confusion_df.to_csv('SGD_outputs/confusion_matrix.csv', index=False)


In [14]:
# Create a DataFrame from y_test and y_prob
roc = pd.DataFrame({
    'actual': y_test,
    'prob_0': [prob[0] for prob in y_prob],
    'prob_1': [prob[1] for prob in y_prob]
})

# Write the DataFrame to a CSV file
roc.to_csv("SGD_outputs/ROC.csv", index=False)

In [6]:
# Get the coefficients of the trained model
feature_names = vectorizer.get_feature_names_out()
coef = clf.coef_[0]

# Create a dictionary of feature names and coefficients
features_coef = dict(zip(feature_names, coef))

# Sort the dictionary by coefficient value
sorted_features = sorted(features_coef.items(), key=lambda x: x[1])

# Print the weights of the top 30 words to a file
with open("SGD_outputs/top_bottom_words.csv", "w") as f:
    for word, weight in sorted_features[-30:]:
        f.write(f"{word},{weight}\n")

    # Print the weights of the bottom 30 words to the same file
    for word, weight in sorted_features[:30]:
        f.write(f"{word},{weight}\n")

# Store all of the weights in a separate csv file
weights_df = pd.DataFrame(sorted_features, columns=['word', 'weight'])
weights_df.to_csv('SGD_outputs/weights.csv', index=False)


In [7]:
# Define the parameters
train_sizes = np.linspace(0.1, 1.0, 10)
cv = 5

# Calculate the learning curve
train_sizes, train_scores, test_scores = learning_curve(
    clf, X_train, y_train, train_sizes=train_sizes, cv=cv, verbose=0)

# Create a pandas DataFrame with the learning curve coordinates
df_learning_curve = pd.DataFrame({
    'train_sizes': train_sizes,
    'train_scores_mean': np.mean(train_scores, axis=1),
    'test_scores_mean': np.mean(test_scores, axis=1),
    'train_scores_std': np.std(train_scores, axis=1),
    'test_scores_std': np.std(test_scores, axis=1)
})

# Save the DataFrame to a csv file
df_learning_curve.to_csv('SGD_outputs/learning_curve.csv', index=False)


In [8]:
# Define the parameters
parameter_distribution = {'learning_rate': np.exp(np.linspace(np.log(0.0001), np.log(3), 10)),
                          'reguliser_dampening': np.exp(np.linspace(np.log(0.0001), np.log(3), 10))}

# Placeholder to make future comparissons easier
best_hyperparameters = None
print("Learning rate:\tReg.dampening:\tTraining set accuracy:")

for hyperparameters in ParameterSampler(parameter_distribution, n_iter=15):
  # Set up the classifier
  reguliser_dampening = hyperparameters['reguliser_dampening']
  learning_rate = hyperparameters['learning_rate']
  model = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=reguliser_dampening, verbose=0,
                        learning_rate='constant', eta0=learning_rate)

  # Train the classifier
  model.fit(X_train, y_train)

  # Calculate the training accuracy
  training_accuracy = np.sum(model.predict(X_train) == y_train)/len(y_train)

  # Store the hyperparameters if they are better than what we have found before
  if best_hyperparameters is None or best_hyperparameters[1] < training_accuracy:
    best_hyperparameters = (hyperparameters, training_accuracy)
  print("%.5f\t\t%.5f\t\t%.1f%%" % (
      hyperparameters['learning_rate'], hyperparameters['reguliser_dampening'], 100*training_accuracy))

best_learning_rate = best_hyperparameters[0]['learning_rate']
best_reguliser_dampening = best_hyperparameters[0]['reguliser_dampening']
print("Best parameters: %.5f, %.5f" %
      (best_learning_rate, best_reguliser_dampening))


Learning rate:	Reg.dampening:	Training set accuracy:
0.00099		0.00311		59.5%
0.03071		0.00099		67.0%
0.00311		0.03071		53.1%
0.00977		0.30353		53.1%
0.00031		3.00000		53.1%
3.00000		0.00977		53.1%
0.30353		0.30353		53.1%
0.00977		0.09655		53.1%
0.30353		0.00977		56.5%
3.00000		0.09655		53.1%
3.00000		0.95425		53.1%
0.00977		0.00099		68.1%
0.09655		0.00031		83.8%
0.00010		0.00977		59.3%
3.00000		3.00000		53.1%
Best parameters: 0.09655, 0.00031


In [9]:
# Set up the classifier
model = SGDClassifier(loss='hinge', penalty='l2',
                      alpha=best_reguliser_dampening, verbose=1,
                      learning_rate='constant', eta0=best_learning_rate)

# Train on all the non-test data
model.fit(X_train, y_train)

# Run prediction on the test set
test_accuracy = np.sum(model.predict(X_test) == y_test)/len(y_test)

print("Test set accuracy %.1f%%" % (100*test_accuracy))


-- Epoch 1
Norm: 29.21, NNZs: 352566, Bias: 0.115859, T: 95805, Avg. loss: 0.583999
Total training time: 0.27 seconds.
-- Epoch 2
Norm: 29.47, NNZs: 357804, Bias: 0.150616, T: 191610, Avg. loss: 0.520653
Total training time: 0.57 seconds.
-- Epoch 3
Norm: 29.75, NNZs: 359861, Bias: 0.146754, T: 287415, Avg. loss: 0.519226
Total training time: 0.82 seconds.
-- Epoch 4
Norm: 29.79, NNZs: 361334, Bias: 0.109100, T: 383220, Avg. loss: 0.519277
Total training time: 1.04 seconds.
-- Epoch 5
Norm: 29.63, NNZs: 362165, Bias: 0.147720, T: 479025, Avg. loss: 0.519213
Total training time: 1.20 seconds.
-- Epoch 6
Norm: 29.62, NNZs: 362913, Bias: 0.137099, T: 574830, Avg. loss: 0.519251
Total training time: 1.38 seconds.
-- Epoch 7
Norm: 29.53, NNZs: 363489, Bias: 0.142892, T: 670635, Avg. loss: 0.519175
Total training time: 1.53 seconds.
-- Epoch 8
Norm: 29.60, NNZs: 364047, Bias: 0.157375, T: 766440, Avg. loss: 0.519035
Total training time: 1.69 seconds.
Convergence after 8 epochs took 1.69 seco

In [10]:
# Load the new dataset
with open("data_json/SubtaskA/subtaskA_dev_monolingual.jsonl", "r") as f:
    lines = f.readlines()

# Parse each line as a separate JSON object
data = []
for line in lines:
    obj = json.loads(line)
    data.append(obj)

# Convert the list of JSON objects to a pandas DataFrame
dev_df = pd.DataFrame(data)

# Vectorize the text data using TfidfVectorizer
X_dev = vectorizer.transform(dev_df['text'])

# Predict the labels for the new dataset
new_predictions = clf.predict(X_dev)

# Convert the predicted probabilities to binary labels
new_labels = [1 if p >= 0.5 else 0 for p in new_predictions]

# Store the predictions in a separate jsonl file
predictions = list(zip(dev_df['id'], new_labels))
predictions_df = pd.DataFrame(predictions, columns=['id', 'label'])
predictions_df.to_json('SGD_outputs/dev_predictions.jsonl',
                       lines=True, orient='records')


[0 0 0 0 0 0 0 0 0 0]


In [17]:
classification_report2_df = pd.DataFrame(
    classification_report(dev_df['label'], new_labels, output_dict=True)).transpose()
classification_report2_df.to_csv(
    'SGD_outputs/classification_report2.csv', index=False)
