In [45]:
import pandas as pd
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import learning_curve, ParameterSampler, train_test_split
from nltk.corpus import stopwords

In [46]:
# Data preparation and preprocessing

# Load the dataset
with open('data_json/SubtaskB/subtaskB_train.jsonl', 'r') as f:
    df = pd.read_json(f, lines=True, orient='records')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['model'], test_size=0.2, random_state=42)

In [47]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit the label encoder on the string labels
label_encoder.fit(y_train)

# Transform the string labels to integer values
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)


In [48]:
# Vectorize the text data using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [49]:
# Train the SGDClassifier model
clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=1e-3,
                    random_state=42, verbose=0, max_iter=25, tol=None)

clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

# Predict the probabilities of the test set labels
y_prob = clf.predict_proba(X_test)


In [50]:
classification_report_df = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)).transpose()
classification_report_df.to_csv(
    'statistics/SGD_B_outputs/classification_report.csv', index=False)

confusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
confusion_df.to_csv('statistics/SGD_B_outputs/confusion_matrix.csv', index=False)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.61      0.93      0.74      2404
           1       0.63      0.57      0.60      2441
           2       0.53      0.52      0.52      2204
           3       0.55      0.48      0.51      2407
           4       0.45      0.28      0.34      2360
           5       0.52      0.55      0.53      2390

    accuracy                           0.56     14206
   macro avg       0.55      0.56      0.54     14206
weighted avg       0.55      0.56      0.54     14206



In [51]:
# Create a DataFrame from y_test and y_prob
roc = pd.DataFrame({
    'actual': y_test,
    'prob_0': [prob[0] for prob in y_prob],
    'prob_1': [prob[1] for prob in y_prob]
})

# Write the DataFrame to a CSV file
roc.to_csv("statistics/SGD_B_outputs/ROC.csv", index=False)

In [52]:
# Get the coefficients of the trained model
feature_names = vectorizer.get_feature_names_out()
coef = clf.coef_[0]

# Create a dictionary of feature names and coefficients
features_coef = dict(zip(feature_names, coef))

# Sort the dictionary by coefficient value
sorted_features = sorted(features_coef.items(), key=lambda x: x[1])

# Print the weights of the top 30 words to a file
with open("statistics/SGD_B_outputs/top_bottom_words.csv", "w") as f:
    f.write(f"word,weight\n")
    for word, weight in sorted_features[-30:]:
        f.write(f"{word},{weight}\n")

    # Print the weights of the bottom 30 words to the same file
    for word, weight in sorted_features[:30]:
        f.write(f"{word},{weight}\n")

# Store all of the weights in a separate csv file
weights_df = pd.DataFrame(sorted_features, columns=['word', 'weight'])
weights_df.to_csv('statistics/SGD_B_outputs/weights.csv', index=False)


In [53]:
# # Define the parameters
# train_sizes = np.linspace(0.1, 1.0, 10)
# cv = 5

# # Calculate the learning curve
# train_sizes, train_scores, test_scores = learning_curve(
#     clf, X_train, y_train, train_sizes=train_sizes, cv=cv, verbose=0)

# # Create a pandas DataFrame with the learning curve coordinates
# df_learning_curve = pd.DataFrame({
#     'train_sizes': train_sizes,
#     'train_scores_mean': np.mean(train_scores, axis=1),
#     'test_scores_mean': np.mean(test_scores, axis=1),
#     'train_scores_std': np.std(train_scores, axis=1),
#     'test_scores_std': np.std(test_scores, axis=1)
# })

# # Save the DataFrame to a csv file
# df_learning_curve.to_csv('statistics/SGD_B_outputs/learning_curve.csv', index=False)


KeyboardInterrupt: 

In [54]:
# Define the parameters
parameter_distribution = {'learning_rate': np.exp(np.linspace(np.log(0.0001), np.log(3), 10)),
                          'reguliser_dampening': np.exp(np.linspace(np.log(0.0001), np.log(3), 10))}

# Placeholder to make future comparissons easier
best_hyperparameters = None
print("Learning rate:\tReg.dampening:\tTraining set accuracy:")

for hyperparameters in ParameterSampler(parameter_distribution, n_iter=15):
  # Set up the classifier
  reguliser_dampening = hyperparameters['reguliser_dampening']
  learning_rate = hyperparameters['learning_rate']
  model = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=reguliser_dampening, verbose=0,
                        learning_rate='constant', eta0=learning_rate)

  # Train the classifier
  model.fit(X_train, y_train)

  # Calculate the training accuracy
  training_accuracy = np.sum(model.predict(X_train) == y_train)/len(y_train)

  # Store the hyperparameters if they are better than what we have found before
  if best_hyperparameters is None or best_hyperparameters[1] < training_accuracy:
    best_hyperparameters = (hyperparameters, training_accuracy)
  print("%.5f\t\t%.5f\t\t%.1f%%" % (
      hyperparameters['learning_rate'], hyperparameters['reguliser_dampening'], 100*training_accuracy))

best_learning_rate = best_hyperparameters[0]['learning_rate']
best_reguliser_dampening = best_hyperparameters[0]['reguliser_dampening']
print("Best parameters: %.5f, %.5f" %
      (best_learning_rate, best_reguliser_dampening))


Learning rate:	Reg.dampening:	Training set accuracy:
0.03071		0.00010		83.9%
0.03071		3.00000		21.7%
0.09655		3.00000		17.6%
0.00031		0.00977		55.6%
3.00000		0.00977		28.0%
0.30353		0.30353		20.1%
0.00311		3.00000		28.2%
0.00031		0.00010		64.9%
0.09655		0.95425		24.6%
0.30353		0.09655		21.5%
0.00010		0.00010		53.2%
0.95425		0.00031		61.7%
0.30353		0.00977		43.3%
0.03071		0.95425		28.2%
0.00977		0.00977		65.3%
Best parameters: 0.03071, 0.00010


In [55]:
# Set up the classifier
model = SGDClassifier(loss='hinge', penalty='l2',
                      alpha=best_reguliser_dampening, verbose=1,
                      learning_rate='constant', eta0=best_learning_rate)

# Train on all the non-test data
model.fit(X_train, y_train)

# Run prediction on the test set
test_accuracy = np.sum(model.predict(X_test) == y_test)/len(y_test)

print("Test set accuracy %.1f%%" % (100*test_accuracy))


-- Epoch 1
Norm: 17.35, NNZs: 107689, Bias: -0.224802, T: 56821, Avg. loss: 0.227361
Total training time: 0.07 seconds.
-- Epoch 2


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Norm: 23.97, NNZs: 110026, Bias: -0.219274, T: 113642, Avg. loss: 0.125010
Total training time: 0.15 seconds.
-- Epoch 3
Norm: 27.07, NNZs: 110619, Bias: -0.208833, T: 170463, Avg. loss: 0.096802
Total training time: 0.21 seconds.
-- Epoch 4
Norm: 28.74, NNZs: 110933, Bias: -0.202076, T: 227284, Avg. loss: 0.085154
Total training time: 0.29 seconds.
-- Epoch 5
Norm: 29.72, NNZs: 111144, Bias: -0.199312, T: 284105, Avg. loss: 0.079047
Total training time: 0.34 seconds.
-- Epoch 6
Norm: 30.39, NNZs: 111311, Bias: -0.197470, T: 340926, Avg. loss: 0.075458
Total training time: 0.40 seconds.
-- Epoch 7
Norm: 30.90, NNZs: 111408, Bias: -0.197470, T: 397747, Avg. loss: 0.073059
Total training time: 0.46 seconds.
-- Epoch 8
Norm: 31.25, NNZs: 111491, Bias: -0.203305, T: 454568, Avg. loss: 0.071286
Total training time: 0.49 seconds.
-- Epoch 9
Norm: 31.48, NNZs: 111528, Bias: -0.195627, T: 511389, Avg. loss: 0.069913
Total training time: 0.54 seconds.
-- Epoch 10
Norm: 31.70, NNZs: 111592, Bias

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.5s finished


In [56]:
# Load the new dataset
with open("data_json/SubtaskB/subtaskB_dev.jsonl", "r") as f:
    lines = f.readlines()

# Parse each line as a separate JSON object
data = []
for line in lines:
    obj = json.loads(line)
    data.append(obj)

# Convert the list of JSON objects to a pandas DataFrame
dev_df = pd.DataFrame(data)

# Vectorize the text data using TfidfVectorizer
X_dev = vectorizer.transform(dev_df['text'])
y_dev = label_encoder.transform(dev_df['model'])

# Predict the labels for the new dataset
new_predictions = clf.predict(X_dev)

# Store the predictions in a separate jsonl file
predictions = list(zip(dev_df['id'], new_predictions))
predictions_df = pd.DataFrame(predictions, columns=['id', 'label'])
predictions_df.to_json('statistics/SGD_B_outputs/dev_predictions.jsonl',
                       lines=True, orient='records')


In [57]:
classification_report2_df = pd.DataFrame(
    classification_report(y_dev, new_predictions, output_dict=True)).transpose()
classification_report2_df.to_csv(
    'statistics/SGD_B_outputs/classification_report2.csv', index=False)

print(classification_report(y_dev, new_predictions))


              precision    recall  f1-score   support

           0       0.76      1.00      0.87       500
           1       0.57      0.49      0.53       500
           2       0.27      0.09      0.13       500
           3       0.36      0.65      0.47       500
           4       0.17      0.18      0.18       500
           5       0.65      0.44      0.53       500

    accuracy                           0.48      3000
   macro avg       0.47      0.48      0.45      3000
weighted avg       0.47      0.47      0.45      3000

