In [53]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from time import time # needed to measure the elapsed time

In [54]:

def jaccard_distance(vector1:csr_matrix, vector2:csr_matrix):
    """
    Compute Jaccard distance between two sparse vectors/matrices.
    :vector1: First sparse vector, containing float numbers.
    :vector2: Second sparse vector, containing float numbers
    :return: Jaccard distance score(1-Jaccard similarity).
    """
    # we make the vectors/matrices dense so that we can work on them
    v1dense=vector1.todense()
    v2dense=vector2.todense()
    
    # we turn them into boolean vectors, in which every value is either True or False
    v1bool=v1dense.astype(bool)
    v2bool=v2dense.astype(bool)

    # The intersection of the two boolean vectors is the result of the logical AND
    intersection = np.logical_and(v1bool,v2bool) 
    
    # The union of the two boolean vectors is the result of the logical OR
    union = np.logical_or(v1bool,v2bool)
    
    # The size of the intersection is the number of all the non-Zero(non-False) items
    intersection_size=np.count_nonzero(intersection)
    # The size of the union is the number of all the non-Zero(non-False) items
    union_size=np.count_nonzero(union)

    return (1- intersection_size / union_size) if union_size > 0 else 0


In [None]:
print("This is the answer to part 1 of the exercise\n")

# we read the train and the test datasets from the respective files
train_ds_comma=pd.read_csv("train.csv")
test_ds_comma=pd.read_csv("test_without_labels.csv")


fraction=0.15 # the fraction of the datasets, used to create smaller, faster to work on datasets

# Split the training dataset into a smaller one
train_subset, _ = train_test_split(
    train_ds_comma, 
    train_size=fraction, 
    stratify=train_ds_comma['Label'],  # Ensure stratified sampling
    random_state=42
)

train_subset.to_csv('train_subset.csv', index=False)

# Split the testing dataset into a smaller one
test_subset=test_ds_comma.sample(frac=fraction,random_state=42)
test_subset.to_csv('test_subset.csv',index=False)

# the two sets that we are going to work on
train_subset=pd.read_csv('train_subset.csv')
test_subset=pd.read_csv('test_subset.csv')


# If you don't want to run for the complete dataset, but just for subsets, comment the next two lines
train_subset=train_ds_comma
test_subset=test_ds_comma


This is the answer to part 1 of the exercise



In [None]:
# Preprocessing, tokenization and removal of stop words via Vectorizer() of sklearn
# This process is called vectorization, in which documents are transformed into numerical represantations
# It was observed that TfidVectorizer works better in combination with LinearSVC, which is the recommended SVM method for large number of samples 

vctrz= TfidfVectorizer(stop_words='english')
x_train=vctrz.fit_transform(train_subset['Title']+train_subset['Content'])
y_train=train_subset['Label']
x_test=vctrz.transform(test_subset['Title']+ test_subset['Content'])

# The classifiers we are going to compare
classifiers={'Random Forest': RandomForestClassifier(),
             'SVM':LinearSVC()
             }
# We have discovered that kNN is too slow, even for vectorized operations. With that in mind and with some a posteriori knowledge that can be
# found in the report , we have decided to include it only if the size of the subsets is relatively low.
if(fraction<0.05):
    classifiers["kNN"]=KNeighborsClassifier(n_neighbors=7,algorithm='brute',metric=jaccard_distance)

# the results of the comparison
results={}
times={}
# we iterate over the set of the classifiers and we perform cross validation
for name,clfr in classifiers.items():
    print("\nTesting:",name)
    start_time = time()
    scores=cross_val_score(clfr,x_train,y_train,cv=5,scoring='accuracy')
    results[name]=scores.mean() # the score of the validation of the classifier clfr
    times[name]=time() - start_time




Testing: Random Forest

Testing: SVM

Testing: kNN


In [None]:
# we sort the results we got in descending order, so that we can get the best method
sorted_results=dict(sorted(results.items(), key=lambda item: item[1],reverse=True))
    
print(f"5-Fold Cross-Validation Results for{fraction*100: .2f}% of the two sets")
for name,res in sorted_results.items():
    print(f"  ->{name}:{res*100: .2f}% (Runtime {times[name]: .2f} secs)")


# the best method, according to the 5-Fold Validation we performed
best_fit_name= list(sorted_results.keys())[0]
print("The best method, that we will use to predict, is:",best_fit_name)

# we apply our best method on the training set
best_fit_method=classifiers[best_fit_name]
best_fit_method.fit(x_train,y_train)

# we predict on the test set, using our best method
test_subset['Predicted'] = best_fit_method.predict(x_test)
test_subset[['Id', 'Predicted']].to_csv('testSet_categories.csv', index=False) # we output the predictions in the respective .csv fil


5-Fold Cross-Validation Results for 1.00% of the two sets
  ->SVM: 90.06% (Runtime  0.15 secs)
  ->kNN: 85.14% (Runtime  275.00 secs)
  ->Random Forest: 83.08% (Runtime  6.54 secs)
The best method is: SVM
