In [7]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from datasketch import MinHash, MinHashLSH
from scipy.sparse import csr_matrix

from time import time # needed to measure the elapsed time
from itertools import zip_longest
from collections import Counter
import re

In [8]:
def text_to_shingles(text, k=2):
    """
    Split a text into words and return the set of its k-word shingles.
    :param text: The input text string.
    :param k: Number of consecutive words per shingle (default 2).
    :return: Set of k-tuples of consecutive words; empty if the text has
             fewer than k words (or k < 1).
    """
    # Splitting on single delimiter characters leaves an empty string between
    # two adjacent delimiters (e.g. the ", " in "hello, world"). Those empty
    # tokens must be dropped BEFORE building the shingles, otherwise every
    # word pair separated by punctuation + whitespace is lost.
    words = [word for word in re.split(r'[,.\s!"?;_:*@]+', text) if word]

    # zip stops at the shortest slice, so only complete k-word windows are
    # produced and no fill value has to be filtered out afterwards.
    return set(zip(*(words[i:] for i in range(k))))

# this yields better results with high ks
# def text_to_shingles(text, k=5):
#     doc= re.sub('[,;.:?!@*&# ]','',text)
#     shingles = []
#     for i in range(0, len(doc) - k):
#         shingles.append(doc[i:i + k])
#     return set(shingles)

# def text_to_shingles(text,k=2):
#     words=text.lower()
#     return set(re.split(r'[,.\s!"?;_*@]',words))


def jaccard_distance(vector1, vector2):
    """
    Compute Jaccard distance between two vectors/matrices, treating every
    non-zero entry as "present".
    :vector1: First vector; a SciPy sparse matrix or a dense array-like of numbers.
    :vector2: Second vector; a SciPy sparse matrix or a dense array-like of numbers.
    :return: Jaccard distance score (1 - Jaccard similarity); 0 when both
             vectors have no non-zero entries.
    """
    def _as_bool_array(vector):
        # Sparse containers expose toarray(); dense inputs (ndarray, list, ...)
        # are accepted as they are, so the metric works for both kinds of rows.
        dense = vector.toarray() if hasattr(vector, "toarray") else vector
        return np.asarray(dense).astype(bool)

    v1bool = _as_bool_array(vector1)
    v2bool = _as_bool_array(vector2)

    # The size of the intersection is the number of positions set in both vectors
    intersection_size = np.count_nonzero(np.logical_and(v1bool, v2bool))
    # The size of the union is the number of positions set in at least one vector
    union_size = np.count_nonzero(np.logical_or(v1bool, v2bool))

    # Two all-zero vectors are considered identical (distance 0) instead of dividing by zero
    return (1 - intersection_size / union_size) if union_size > 0 else 0

# Jaccard similarity for sets
def jaccard_similarity_set(set1, set2):
    """
    Compute the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.
    :param set1: First collection of hashable items (duplicates are ignored).
    :param set2: Second collection of hashable items (duplicates are ignored).
    :return: Similarity in [0, 1]; 1.0 when both collections are empty, which
             matches jaccard_distance returning 0 for an empty union.
    """
    first, second = set(set1), set(set2)
    union_size = len(first | second)
    if union_size == 0:
        # Two empty sets are identical; avoid dividing by zero
        return 1.0
    return len(first & second) / union_size

def preprocess_text(text):
    """
    Normalise raw text before vectorization.
    The text is lowercased and each of the characters . , ! ? : @ % * ( )
    is replaced by a single space (the characters are not simply deleted,
    so neighbouring words never get glued together).
    :param text: The input text string.
    :return: The normalised text string.
    """
    lowered = text.lower()
    return re.sub('[.,!?:@%*()]', ' ', lowered)

In [9]:
print("This is the answer to part 1 of the exercise\n")

# we read the train and the test datasets from the respective files
train_ds_comma=pd.read_csv("train.csv")
test_ds_comma=pd.read_csv("test_without_labels.csv")

# The fraction of the two datasets we work on.
#   1.0        -> the full datasets are used
#   0 < f < 1  -> smaller, faster to work on subsets are created
# Later cells print this value and use it to decide whether kNN is affordable,
# so it has to describe the data that is really used. Previously the subsets
# were built and then silently replaced by the full datasets.
fraction=1.0
if not 0 < fraction <= 1:
    raise ValueError(f"fraction must be in the interval (0, 1], got {fraction}")

if fraction < 1:
    # Split the training dataset into a smaller one.
    # train_test_split only accepts a float train_size strictly below 1.0,
    # which is why the full-dataset case is handled separately.
    train_subset, _ = train_test_split(
        train_ds_comma,
        train_size=fraction,
        stratify=train_ds_comma['Label'],  # Ensure stratified sampling
        random_state=42
    )
    train_subset = train_subset.reset_index(drop=True)

    # Split the testing dataset into a smaller one
    test_subset = test_ds_comma.sample(frac=fraction, random_state=42).reset_index(drop=True)

    # keep a copy of the subsets on disk, so that they can be inspected or reused
    train_subset.to_csv('train_subset.csv', index=False)
    test_subset.to_csv('test_subset.csv', index=False)
else:
    # Work on copies, so that columns added later (e.g. the predictions)
    # do not modify the raw dataframes we read above
    train_subset = train_ds_comma.copy()
    test_subset = test_ds_comma.copy()


This is the answer to part 1 of the exercise



In [10]:
# Preprocessing, tokenization and removal of stop words via the vectorizer of sklearn
# This process is called vectorization, in which documents are transformed into numerical representations
# It was observed that TfidfVectorizer works better in combination with LinearSVC, which is the recommended SVM method for large number of samples

def _title_and_content(frame):
    """
    Build one text per row out of the 'Title' and 'Content' columns.
    A space separates the two parts, so that the last word of the title and
    the first word of the content stay two different tokens. Missing values
    become empty strings, because a NaN document is rejected by the vectorizer.
    :param frame: DataFrame with 'Title' and 'Content' columns.
    :return: Series with the combined text of every row.
    """
    return frame['Title'].fillna('') + ' ' + frame['Content'].fillna('')

vctrz= TfidfVectorizer(stop_words='english')
# preprocess_text() could be applied on the combined text as well, but the
# vectorizer already lowercases and ignores punctuation when it tokenizes
x_train=vctrz.fit_transform(_title_and_content(train_subset))
y_train=train_subset['Label']
# the test set is only transformed: the vocabulary is learned from the training set alone
x_test=vctrz.transform(_title_and_content(test_subset))

# The classifiers we are going to compare
# (a fixed random_state makes the comparison reproducible between runs)
classifiers={'Random Forest': RandomForestClassifier(random_state=42),
             'SVM':LinearSVC(random_state=42)
             }
# brute force kNN with a python metric is only affordable on very small datasets
if fraction < 0.10:
    classifiers["kNN"]=KNeighborsClassifier(n_neighbors=7,algorithm='brute',metric=jaccard_distance)

# the results of the comparison
results={}  # classifier name -> mean accuracy over the 5 folds
times={}    # classifier name -> seconds needed for the whole cross validation
# we iterate over the set of the classifiers and we perform cross validation
for name,clfr in classifiers.items():
    print("\nTesting:",name)
    start_time = time()
    scores=cross_val_score(clfr,x_train,y_train,cv=5,scoring='accuracy')
    results[name]=scores.mean() # the score of the validation of the classifier clfr
    times[name]=time() - start_time




Testing: SVM


In [11]:
# 5-Fold cross validation of KNN
# if(fraction<=0.01 and 0):
#     skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#     X=train_subset['Title'].apply(preprocess_text)+train_subset['Content'].apply(preprocess_text)
#     y=train_subset['Label']

#     accur=[]


#     for train_index, val_index in skf.split(X, y):
#         # Split into train and validation sets
#         X_train_knn, X_val_knn = X.iloc[train_index], X.iloc[val_index]
#         y_train_knn, y_val_knn = y.iloc[train_index], y.iloc[val_index]



#         train_shingles = X_train_knn.apply(text_to_shingles)
#         val_shingles = X_val_knn.apply(text_to_shingles)

#         # print(train_shingles)
#         # print(val_shingles)
#         # Perform brute-force K-NN for validation
#         predictions = []
#         k=7
#         for val_doc in val_shingles:
#             similarities = [(i, jaccard_similarity(val_doc,train_doc))
#                                 for i, train_doc in enumerate(train_shingles)]
#             similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]

#             # print(similarities)
#             # Majority voting
#             neighbor_labels = [y_train_knn.iloc[idx] for idx, _ in similarities]
#             majority_label = Counter(neighbor_labels).most_common(1)[0][0]
#             predictions.append(majority_label)

#         # Compute accuracy
#         accuracy = accuracy_score(y_val_knn, predictions)
#         accur.append(accuracy)

#     knn_acc=np.mean(accur)
# else:
#     knn_acc=0

In [None]:
# results['kNN']=knn_acc   # only needed when the manual kNN validation is enabled

# we sort the results we got in descending order, so that we can get the best method
ranked = sorted(results.items(), key=lambda pair: pair[1], reverse=True)
sorted_results = dict(ranked)

print(f"5-Fold Cross-Validation Results for{fraction*100: .2f}% of the two sets")
for name in sorted_results:
    res = sorted_results[name]
    print(f"  ->{name}:{res*100: .2f}% (Runtime {times[name]: .2f} secs)")

# the best method, according to the 5-Fold Validation we performed,
# is the first entry of the ranking
# best_fit_name='SVM'
best_fit_name = ranked[0][0]
print("The best method is:",best_fit_name)

# we train our best method on the whole training set
best_fit_method = classifiers[best_fit_name]
best_fit_method.fit(x_train, y_train)

# we predict on the test set, using our best method, and we keep the
# predictions as a new column next to the documents they belong to
test_subset['Predicted'] = best_fit_method.predict(x_test)

# we output the predictions (only the Id and the predicted category) in the respective .csv file
test_subset.to_csv('testSet_categories.csv', columns=['Id', 'Predicted'], index=False)


5-Fold Cross-Validation Results for 100.00% of the two sets
  ->SVM: 97.42% (Runtime  23.39 secs)
The best method is: SVM
