In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from datasketch import MinHash, MinHashLSH


from time import time # needed to measure the elapsed time
from itertools import zip_longest
from collections import Counter

In [2]:
def text_to_shingles(text, k=5):
    words = text.split()
    return set(zip_longest(*[words[i:] for i in range(k)], fillvalue=''))


def jaccard_similarity(set1, set2):
    return len(set1 & set2) / len(set1 | set2)

In [3]:
print("This is the answer to part 1 of the exercise\n")

# we read the train and the test datasets from the respective files
train_ds_comma=pd.read_csv("train.csv")
test_ds_comma=pd.read_csv("test_without_labels.csv")


fraction=0.10 # the fraction of the datasets, used to create smaller, faster to work on datasets

# Split the training dataset into a smaller one
train_subset, _ = train_test_split(
    train_ds_comma, 
    train_size=fraction, 
    stratify=train_ds_comma['Label'],  # Ensure stratified sampling
    random_state=42
)

train_subset.to_csv('train_subset.csv', index=False)

# Split the testing dataset into a smaller one
test_subset=test_ds_comma.sample(frac=fraction,random_state=42)
test_subset.to_csv('test_subset.csv',index=False)

# the two sets that we are going to work on
train_subset=pd.read_csv('train_subset.csv')
test_subset=pd.read_csv('test_subset.csv')

This is the answer to part 1 of the exercise



In [4]:
# Preprocessing, tokenization and removal of stop words via Vectorizer() of sklearn
# This process is called vectorization, in which documents are transformed into numerical represantations
# It was observed that TfidVectorizer works better in combination with LinearSVC, which is the recommended SVM method for large number of samples 

vctrz= TfidfVectorizer(stop_words='english')
x_train=vctrz.fit_transform(train_subset['Title']+train_subset['Content'])
y_train=train_subset['Label']
x_test=vctrz.transform(test_subset['Title']+ test_subset['Content'])

# The classifiers we are going to compare
classifiers={'Random Forest': RandomForestClassifier(),
            #  'SVM': SVC(),
             'SVM':LinearSVC()}

# the results of the comparison
results={}

# we iterate over the set of the classifiers and we perform cross validation
for name,clfr in classifiers.items():
    print("\nTesting:",name)
    scores=cross_val_score(clfr,x_train,y_train,cv=5,scoring='accuracy')
    results[name]=scores.mean() # the score of the validation of the classifier clfr




Testing: Random Forest

Testing: SVM


In [5]:
# 5-Fold cross validation of KNN
if(fraction<=0.15):
    skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    X=train_subset['Title']+train_subset['Content']
    y=train_subset['Label']

    accur=[]

    for train_index, val_index in skf.split(X, y):
        # Split into train and validation sets
        X_train_knn, X_val_knn = X.iloc[train_index], X.iloc[val_index]
        y_train_knn, y_val_knn = y.iloc[train_index], y.iloc[val_index]



        train_shingles = X_train_knn.apply(text_to_shingles)
        val_shingles = X_val_knn.apply(text_to_shingles)

        # Perform brute-force K-NN for validation
        predictions = []
        k=7
        for val_doc in val_shingles:
            similarities = [(i, jaccard_similarity(val_doc,train_doc))
                                for i, train_doc in enumerate(train_shingles)]
            similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]

            # Majority voting
            neighbor_labels = [y_train_knn.iloc[idx] for idx, _ in similarities]
            majority_label = Counter(neighbor_labels).most_common(1)[0][0]
            predictions.append(majority_label)

        # Compute accuracy
        accuracy = accuracy_score(y_val_knn, predictions)
        accur.append(accuracy)

    knn_acc=np.mean(accur)
else:
    knn_acc=0

In [6]:
results['kNN']=knn_acc

print(f"5-Fold Cross-Validation Results for {fraction*100}% of the two sets")
for name,res in results.items():
    if(res!=0):
        print(f"  ->{name}: {res*100} %")

# we sort the results we got in descending order, so that we can get the best method
sorted_results=dict(sorted(results.items(), key=lambda item: item[1],reverse=True))

# the best method, according to the 5-Fold Validation we performed
best_fit_name= list(sorted_results.keys())[0]
print("The best method is:",best_fit_name)

# we apply our best method on the training set
best_fit_method=classifiers[best_fit_name]
best_fit_method.fit(x_train,y_train)

# we predict on the test set, using our best method
test_subset['Predicted'] = best_fit_method.predict(x_test)
test_subset[['Id', 'Predicted']].to_csv('testSet_categories.csv', index=False) # we output the predictions in the respective .csv fil


5-Fold Cross-Validation Results for 10.0% of the two sets
  ->Random Forest: 89.20305515201721 %
  ->SVM: 95.50057429174021 %
  ->kNN: 71.10631800954886 %
The best method is: SVM


In [1]:
print("This is the stsrt of part 2")

This is the stsrt of part 2
