In [None]:
import datetime
import time
from pathlib import Path

import pandas as pd

# Pre-Processing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Modelling
from sklearn.linear_model import LogisticRegression

# Evaluation
import plotly.figure_factory as ff
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, matthews_corrcoef, confusion_matrix

In [None]:
# Load the pre-processed tweets; the 'tweet' and 'target_binary' columns of
# this frame feed the train/test split further down.
df = pd.read_json(path_or_buf='../Data/Processed/processed_binary.json')
# Bare expression so the notebook renders the frame for a quick visual check.
df

In [None]:
# Initialise Vectoriser
# TF-IDF features with the built-in "english" stop-word list removed.
vectoriser = TfidfVectorizer(stop_words="english")

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['target_binary'],
    test_size=0.25,
    random_state=14,
)

In [None]:
# Fit the vectoriser on the training tweets only and convert them to a
# TF-IDF matrix in the same step.
X_train_vectorised = vectoriser.fit_transform(X_train)
print(X_train_vectorised.shape)

# The test tweets are only transformed (never fitted on), so they are
# projected onto the representation learned from the training split.
X_test_vectorised = vectoriser.transform(X_test)

In [None]:
# Initialise Model
# L2-penalised logistic regression; max_iter=10000 gives the solver a
# generous iteration budget before it stops.
clf = LogisticRegression(max_iter=10000, penalty='l2', C=1.0)

# Fit Model to Vectorised Training Data
# Time the fit with a monotonic, high-resolution clock. Unlike
# datetime.datetime.now(), time.perf_counter() is unaffected by system clock
# adjustments (NTP sync, DST changes), so the measured duration stays valid.
start_time = time.perf_counter()
clf.fit(X_train_vectorised, y_train)
end_time = time.perf_counter()

# Elapsed fit time in seconds (float), reported and exported further down.
training_time = end_time - start_time

In [None]:
# Quick sanity check: the classifier's score on the held-out test split,
# shown as the cell output.
clf.score(X=X_test_vectorised, y=y_test)

In [None]:
# Hard label predictions for both splits.
train_predictions = clf.predict(X_train_vectorised)
test_predictions = clf.predict(X_test_vectorised)

# Accuracy on each split; comparing the two shows how far training accuracy
# sits above test accuracy.
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Matthews correlation coefficient from the hard test predictions.
mcc = matthews_corrcoef(y_test, test_predictions)

# ROC AUC is computed from the continuous decision_function scores rather
# than from the thresholded labels.
decision_scores = clf.decision_function(X_test_vectorised)
auc = roc_auc_score(y_test, decision_scores)

# One metric per output line.
print(
    f'Train Accuracy: {train_accuracy}',
    f'Test Accuracy: {test_accuracy}',
    f'Training Time: {training_time}',
    f'AUC: {auc}',
    f'Matthews Correlation Coefficient: {mcc}',
    sep='\n',
)

In [None]:
def plot_confusion_matrix_and_report(y_true, y_pred, title):
    """Display a confusion-matrix heatmap and print a classification report.

    The axis labels of the heatmap and the class names in the report are
    hard-coded for two classes.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Text placed in front of the accuracy in the figure title.

    Returns:
        None. The figure is shown with ``show()`` and the report is printed.
    """
    # Compute every metric up front, before anything is drawn.
    counts = confusion_matrix(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    report_text = classification_report(
        y_true,
        y_pred,
        target_names=['Class 0', 'Class 1'],
    )

    # Annotated heatmap of the confusion counts at a fixed 500x400 size.
    heatmap = ff.create_annotated_heatmap(
        z=counts,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
    )
    heatmap.update_layout(
        title=title + f' : Accuracy: {acc:.2f}',
        autosize=False,
        width=500,
        height=400,
    )
    heatmap.show()

    # Heading and report body on consecutive lines.
    print("Classification Report:", report_text, sep="\n")

In [None]:
# Confusion matrix and per-class report for the held-out test split.
plot_confusion_matrix_and_report(
    y_true=y_test,
    y_pred=test_predictions,
    title='TF-IDF Binary',
)

In [None]:
# Load the internal validation set; its 'tweet' and 'target_binary' columns
# are scored against the trained model in the following cells.
internal_val_df = pd.read_json(path_or_buf='../Data/Processed/processed_binary_val.json')
# Bare expression so the notebook renders the frame for inspection.
internal_val_df

In [None]:
# Score the trained model on the internal validation set. The vectoriser is
# only applied here (transform, no re-fit), so the features match the ones the
# classifier was trained on.
internal_vectorised = vectoriser.transform(internal_val_df['tweet'])
internal_val_accuracy = clf.score(internal_vectorised, internal_val_df['target_binary'])
# Label corrected: this cell evaluates the internal validation file, not the
# COVID cross-validation data, so the output must not be tagged "(COVID)".
print(f'Validation Set (Internal) Accuracy: {internal_val_accuracy}')

In [None]:
# Ground truth and model predictions for the internal validation set.
val_true_labels = internal_val_df['target_binary']
val_predicted_labels = clf.predict(internal_vectorised)

# Matthews correlation coefficient on the internal validation set.
i_val_mcc = matthews_corrcoef(y_true=val_true_labels, y_pred=val_predicted_labels)
print(f'Matthews Correlation Coefficient: {i_val_mcc}')

In [None]:
# Load the external (COVID) validation set; its 'tweet' and 'target' columns
# are scored against the trained model in the following cells.
external_val_df = pd.read_json(path_or_buf='../Data/Cross_Validation/COVID_processed.json')
# Bare expression so the notebook renders the frame for inspection.
external_val_df

In [None]:
# Score the trained model on the external COVID set, re-using the already
# fitted vectoriser (transform only, no re-fit).
external_vectorised = vectoriser.transform(external_val_df['tweet'])
# NOTE(review): labels are read from 'target' here, whereas the training and
# internal validation frames use 'target_binary' - confirm this column uses
# the same binary encoding.
external_val_accuracy = clf.score(
    X=external_vectorised,
    y=external_val_df['target'],
)
print(f'Validation Set (COVID) Accuracy: {external_val_accuracy}')

In [None]:
# Ground truth and model predictions for the external (COVID) validation set.
# NOTE(review): these two names are also assigned in the internal-validation
# MCC cell; once this cell has run they refer to the external set, so
# re-running the internal cell's MCC line alone would use the wrong data.
val_true_labels = external_val_df['target']
val_predicted_labels = clf.predict(external_vectorised)

# Matthews correlation coefficient on the external validation set.
e_val_mcc = matthews_corrcoef(y_true=val_true_labels, y_pred=val_predicted_labels)
print(f'Matthews Correlation Coefficient: {e_val_mcc}')

In [None]:
# Combined validation score: the plain sum of the external and internal MCCs.
total_mcc = e_val_mcc + i_val_mcc

# One-row summary of this run. Every value is wrapped in a list so that all
# columns are built the same way and have length 1.
data = {
    "Model": ["TF-IDF (Binary)"],
    "Train Accuracy": [train_accuracy],
    "Test Accuracy": [test_accuracy],
    "MCC": [mcc],
    "AUC": [auc],
    "Training Time (s)": [training_time],
    "(I) Validation Accuracy": [internal_val_accuracy],
    "(I) Validation MCC": [i_val_mcc],
    "(E) Validation Accuracy": [external_val_accuracy],
    "(E) Validation MCC": [e_val_mcc],
    "Total Validation MCC": [total_mcc]
}

results = pd.DataFrame(data)

# Create the output directory if it is missing: to_csv does not create parent
# directories, and failing here would discard the whole run's metrics.
results_path = Path('../Results/TF-IDF_binary.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_path, index=False)