In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from math import sqrt
from sklearn import metrics

In [None]:
# Load the training and test CSV files; the first column of each file is used as the row index.
train_df = pd.read_csv('data/train.csv', index_col=0)
test_df = pd.read_csv('data/test.csv', index_col=0)

In [None]:
# Preview the first rows of the test frame (columns: label, title, description).
test_df.head()

Unnamed: 0,label,title,description
0,2,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,3,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,3,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,3,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,3,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [None]:
# Row counts of the training and test frames.
len(train_df), len(test_df)

(12105, 7600)

In [None]:
import typing as th
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import string


# Default number of SVD components kept for each text embedding.
VECTOR_SIZE = 128
# The ten decimal digit characters, '0' through '9'.
NUMERICS = string.digits
# Translation table for str.translate that deletes digits and ASCII punctuation.
TRANSLATOR = str.maketrans(dict.fromkeys(NUMERICS + string.punctuation))


class Preprocessor(BaseEstimator, TransformerMixin):
    """Stateless text-cleaning step for frames with 'title' and 'description' columns.

    ``transform`` returns a copy of the input in which:

    * 'title' and 'description' are cast to ``str``, stripped of digits and
      ASCII punctuation (via ``TRANSLATOR``) and lower-cased;
    * 'combination' holds the cleaned ``title + ' ' + description``;
    * 'title-split', 'description-split' and 'combination-split' hold the
      whitespace-tokenised version of the corresponding cleaned column.

    Because values are cast with ``str``, a missing value becomes the literal
    text produced by ``str()`` for it (e.g. ``'nan'``) rather than staying null.
    """

    def __init__(self):
        # Nothing to configure. (Previously spelled ``init``, which Python never
        # treats as a constructor.)
        pass

    def fit(self, x, y=None, **params):
        """No-op: the transformer learns nothing from the data. Returns ``self``."""
        return self

    def fit_transform(self, x, y=None, **params):
        """Equivalent to ``fit(x, y, **params).transform(x)``."""
        return self.fit(x, y, **params).transform(x)

    @staticmethod
    def _clean(series):
        """Cast every value to ``str``, delete digits/punctuation, lower-case."""
        return series.map(str).apply(
            lambda item: item.translate(TRANSLATOR).lower()
        )

    def transform(self, x):
        """Return a cleaned copy of ``x`` with the extra columns described on the class.

        The input frame itself is not modified.
        """
        x_copy = x.copy()
        x_copy['title'] = self._clean(x['title'])
        x_copy['description'] = self._clean(x['description'])
        x_copy['combination'] = self._clean(
            x['title'].map(str) + ' ' + x['description'].map(str)
        )
        # Tokenise each cleaned text column on whitespace.
        for column in ('title', 'description', 'combination'):
            x_copy[f'{column}-split'] = x_copy[column].apply(str.split)
        return x_copy


def _make_tfidf_svd_pipeline():
    """Build a fresh, unfitted TF-IDF -> TruncatedSVD pipeline with default settings."""
    return Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('pca', TruncatedSVD())
    ])


# One independent pipeline per text column; each is configured and fitted later.
title_tf_idf_vectorizer = _make_tfidf_svd_pipeline()
description_tf_idf_vectorizer = _make_tfidf_svd_pipeline()
combination_tf_idf_vectorizer = _make_tfidf_svd_pipeline()


class Vectorizer(BaseEstimator, TransformerMixin):
    """Turn the cleaned text columns into dense TF-IDF + TruncatedSVD vectors.

    Expects a frame carrying the columns 'title', 'description', 'combination'
    and their '-split' counterparts. ``transform`` adds three list-valued
    columns and drops the text columns:

    * 'vec_1' - embedding of 'title'
    * 'vec_2' - embedding of 'description'
    * 'vec_3' - embedding of 'combination'

    Parameters
    ----------
    n_components : int, default=VECTOR_SIZE
        Number of SVD components used for a pipeline unless its parameter dict
        already contains 'pca__n_components'.

    Notes
    -----
    The three pipelines are the module-level objects ``title_tf_idf_vectorizer``,
    ``description_tf_idf_vectorizer`` and ``combination_tf_idf_vectorizer``, so
    every ``Vectorizer`` instance shares (and refits) the same underlying models.
    """

    def __init__(self, n_components=VECTOR_SIZE):
        self.n_components = n_components

        # Per-pipeline keyword overrides forwarded to Pipeline.set_params.
        self.title_tfidf = dict()
        self.description_tfidf = dict()
        self.combination_tfidf = dict()

        # title vectorizer
        self.title_tfidf_vectorizer = title_tf_idf_vectorizer
        self.title_tfidf_vectorizer.set_params(**self.title_tfidf)

        # description vectorizer
        self.description_tfidf_vectorizer = description_tf_idf_vectorizer
        self.description_tfidf_vectorizer.set_params(**self.description_tfidf)

        # combination vectorizer.
        # This must be its own pipeline: aliasing the description pipeline here
        # would make the fit on 'combination' overwrite the description model,
        # and 'vec_2' would then come from a model fitted on the wrong column.
        self.combination_tfidf_vectorizer = combination_tf_idf_vectorizer
        self.combination_tfidf_vectorizer.set_params(**self.combination_tfidf)

    def fit(self, x, y=None):
        """Fit the three pipelines on their respective text columns of ``x``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        # finalizing title parameters (an explicit override wins over n_components)
        self.title_tfidf['pca__n_components'] = self.title_tfidf.get('pca__n_components', self.n_components)
        self.title_tfidf_vectorizer.set_params(**self.title_tfidf)

        # finalizing description parameters
        self.description_tfidf['pca__n_components'] = self.description_tfidf.get('pca__n_components', self.n_components)
        self.description_tfidf_vectorizer.set_params(**self.description_tfidf)

        # finalizing combination parameters
        self.combination_tfidf['pca__n_components'] = self.combination_tfidf.get('pca__n_components', self.n_components)
        self.combination_tfidf_vectorizer.set_params(**self.combination_tfidf)

        # fitting models
        self.title_tfidf_vectorizer.fit(x['title'])
        self.description_tfidf_vectorizer.fit(x['description'])
        self.combination_tfidf_vectorizer.fit(x['combination'])
        return self

    def transform(self, x):
        """Return a copy of ``x`` with 'vec_1'..'vec_3' added and the text columns removed.

        Each vector is stored as a plain Python list in its cell.
        """
        x_copy = x.copy()
        x_copy['vec_1'] = self.title_tfidf_vectorizer.transform(x['title']).tolist()
        x_copy['vec_2'] = self.description_tfidf_vectorizer.transform(x['description']).tolist()
        x_copy['vec_3'] = self.combination_tfidf_vectorizer.transform(x['combination']).tolist()
        x_copy = x_copy.drop(columns=['title',
                                      'description',
                                      'title-split',
                                      'description-split',
                                      'combination',
                                      'combination-split'])
        return x_copy

    def fit_transform(self, x, y=None, **fit_params):
        """Equivalent to ``fit(x, y).transform(x)``; ``fit_params`` are ignored."""
        return self.fit(x, y).transform(x)

In [None]:
# Text cleaning followed by vectorisation, fitted on the training frame only.
preprocessor = Pipeline(steps=[
    ('preprocess', Preprocessor()),
    ('vectorizer', Vectorizer()),
]).fit(train_df)

# Apply the fitted pipeline to both frames and drop rows containing missing values.
train_data, test_data = (
    preprocessor.transform(frame).dropna() for frame in (train_df, test_df)
)

train_data.head(2)

Unnamed: 0,label,vec_1,vec_2,vec_3
0,2,"[0.08125303122076631, -0.006681200848573755, 0...","[0.2418105510200397, 0.1346270119944103, 0.005...","[0.23430001123054556, 0.22338673759683889, -0...."
1,2,"[0.15750196368965114, -0.11621212954560559, -0...","[0.16799963763805328, 0.13156697017078794, 0.0...","[0.1605558979445048, 0.17542592883217825, 0.01..."


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the vectorised training data as a validation split.
# A fixed random_state keeps the split, and therefore every score computed
# from it, reproducible across kernel restarts.
train_split, val_split = train_test_split(train_data, test_size=0.1, random_state=42)

# Defensive: make sure neither split contains rows with missing values.
train_split = train_split.dropna()
val_split = val_split.dropna()

In [None]:
# (rows, columns) of the training and validation splits.
train_split.shape, val_split.shape

((10894, 4), (1211, 4))

In [None]:
# Convert the frames to NumPy arrays, which are easier to feed to the models.
def _split_to_arrays(frame):
    """Return (labels, vec_1, vec_2, vec_3) of ``frame`` as NumPy arrays.

    Each 'vec_*' column stores one list per row; stacking them yields one
    2-D array per column.
    """
    labels = frame['label'].to_numpy()
    stacked = [np.array(frame[column].to_list()) for column in ('vec_1', 'vec_2', 'vec_3')]
    return (labels, *stacked)


# tr* = training split, te* = test set, v* = validation split.
trlable, trvec1, trvec2, trvec3 = _split_to_arrays(train_split)
telable, tevec1, tevec2, tevec3 = _split_to_arrays(test_data)
vlable, vvec1, vvec2, vvec3 = _split_to_arrays(val_split)

In [None]:
class Naive_bayes_classifier ():
    """Gaussian naive Bayes classifier written with NumPy.

    ``fit`` stores, for every class, its prior probability and the per-feature
    mean / standard deviation in ``self.class_summary``. ``predict`` returns the
    class with the highest posterior score for every row.

    Scoring is done in log-space: multiplying many Gaussian densities directly
    underflows to 0.0 (or overflows) in floating point, which makes all classes
    tie and silently turns the arg-max into "first class seen".
    """

    # Lower bound applied to standard deviations when scoring, so that a feature
    # that is constant within a class does not cause a division by zero.
    _MIN_STD = 1e-9

    def seperation(self, X, y):
        """Group the rows of ``X`` by their label in ``y``.

        Returns a dict mapping each class label to the list of its rows, in
        order of first appearance.
        """
        sep_classes = {}
        for feature_values, class_name in zip(X, y):
            sep_classes.setdefault(class_name, []).append(feature_values)
        return sep_classes

    def neededvalues (self, X):
        """Yield ``{'std': ..., 'mean': ...}`` for every feature (column) of ``X``."""
        for feature in zip(*X):
            yield {
                'std' : np.std(feature),
                'mean' : np.mean(feature)}

    def fit (self, X, y):
        """Estimate class priors and per-feature Gaussian parameters.

        Returns ``self.class_summary``: a dict keyed by class label whose values
        are ``{'prior_proba': float, 'summary': [{'std', 'mean'}, ...]}``.
        """
        sep_classes = self.seperation(X, y)
        self.class_summary = {}
        for class_name, feature_values in sep_classes.items():
            self.class_summary[class_name] = {
                'prior_proba': len(feature_values)/len(X),
                'summary': [i for i in self.neededvalues(feature_values)],
            }
        return self.class_summary

    def gaussian_distribution(self, x, mean, std):
        """Density of the normal distribution N(mean, std**2) evaluated at ``x``."""
        exponent = np.exp(-((x-mean)**2 / (2*std**2)))
        return exponent / (np.sqrt(2*np.pi)*std)

    def _log_gaussian(self, x, mean, std):
        """Natural log of the normal density, with ``std`` floored at ``_MIN_STD``."""
        std = np.maximum(std, self._MIN_STD)
        return -0.5 * np.log(2 * np.pi) - np.log(std) - (x - mean) ** 2 / (2 * std ** 2)

    def predict(self, X):
        """Return a list with the predicted class label of every row of ``X``.

        Raises ``ValueError`` if the model holds no classes (fitted on no data).
        Ties are resolved in favour of the class encountered first during ``fit``.
        """
        if not self.class_summary:
            raise ValueError("no classes available: fit the model on non-empty data first")

        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return []

        class_names = list(self.class_summary)
        log_joint = np.empty((len(X), len(class_names)))
        for column, class_name in enumerate(class_names):
            features = self.class_summary[class_name]
            means = np.array([item['mean'] for item in features['summary']], dtype=float)
            stds = np.array([item['std'] for item in features['summary']], dtype=float)
            total_features = len(means)
            # log P(class) + sum_i log P(x_i | class)
            log_likelihood = self._log_gaussian(X[:, :total_features], means, stds).sum(axis=1)
            log_joint[:, column] = np.log(features['prior_proba']) + log_likelihood

        # argmax returns the first maximum, i.e. the earliest-seen class on ties.
        return [class_names[index] for index in log_joint.argmax(axis=1)]


In [None]:
#vec1
# Naive Bayes on the title vectors; accuracy is reported on the validation split.
nv_vec1 =  Naive_bayes_classifier()
nv_vec1.fit(trvec1, trlable)
predictions = nv_vec1.predict(vvec1)
print(accuracy_score(vlable, predictions))


0.47894302229562347


In [None]:
#vec2
# Naive Bayes on the description vectors; accuracy is reported on the validation split.
nv_vec2 =  Naive_bayes_classifier()
nv_vec2.fit(trvec2, trlable)
predictions = nv_vec2.predict(vvec2)
print(accuracy_score(vlable, predictions))

0.6837324525185797


In [None]:
#vec3
# Naive Bayes on the combined title+description vectors, scored on the validation split.
nv_vec3 =  Naive_bayes_classifier()
nv_vec3.fit(trvec3, trlable)
# Predict with the model trained on vec_3 (not nv_vec2, which was fitted on vec_2).
predictions = nv_vec3.predict(vvec3)
print(accuracy_score(vlable, predictions))

0.6829066886870355


In [None]:
class KNN():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction.
    X : array-like of shape (n_samples, n_features)
        Training feature vectors.
    y : array-like of shape (n_samples,)
        Training labels.
    """

    def __init__(self,k,X,y):
        self.k = k
        self.train_features = np.asarray(X, dtype=float)
        self.trlable = np.asarray(y)

    @staticmethod
    def euclideand(raw1, raw2):
        """Euclidean distance between ``raw1`` and ``raw2`` along the last axis.

        With ``raw1`` of shape (n_samples, n_features) and ``raw2`` of shape
        (n_features,), the result has shape (n_samples,): one distance per row.
        Two 1-D vectors give a single scalar distance.
        """
        difference = np.asarray(raw1, dtype=float) - np.asarray(raw2, dtype=float)
        return np.sqrt((difference ** 2).sum(axis=-1))

    def predict(self, X):
        """Return a list with the majority label among the k nearest training rows.

        Vote ties are resolved in favour of the smallest label value, so the
        result is deterministic.
        """
        predictions = []
        for pred in np.asarray(X, dtype=float):
            distance = KNN.euclideand(self.train_features, pred)
            # Stable sort keeps the original row order among equidistant points.
            nearest = np.argsort(distance, kind='stable')[:self.k]
            labels, counts = np.unique(self.trlable[nearest], return_counts=True)
            predictions.append(labels[np.argmax(counts)])
        return predictions

In [None]:
# k-NN with k=20 on the title vectors; accuracy is reported on the validation split.
knnvec1 = KNN(20, trvec1, trlable)
predictions = knnvec1.predict(vvec1)
print(accuracy_score(vlable, predictions))

IndexError: index 128 is out of bounds for axis 0 with size 128

In [None]:
# k-NN with k=20 on the description vectors; accuracy is reported on the validation split.
knnvec2 = KNN(20, trvec2, trlable)
predictions = knnvec2.predict(vvec2)
print(accuracy_score(vlable, predictions))

IndexError: index 128 is out of bounds for axis 0 with size 128

In [None]:
# k-NN with k=20 on the combined title+description vectors; accuracy is reported on the validation split.
knnvec3 = KNN(20, trvec3, trlable)
predictions = knnvec3.predict(vvec3)
print(accuracy_score(vlable, predictions))

0.2336911643270025


In [None]:
# SVC with default hyper-parameters on the title vectors; accuracy is reported on the validation split.
svmvec1 = SVC()
svmvec1.fit(trvec1, trlable)
predictions = svmvec1.predict(vvec1)
print(accuracy_score(vlable, predictions))

0.7027250206440958


In [None]:
# SVC with default hyper-parameters on the description vectors; accuracy is reported on the validation split.
svmvec2 = SVC()
svmvec2.fit(trvec2, trlable)
predictions = svmvec2.predict(vvec2)
print(accuracy_score(vlable, predictions))

0.8554913294797688


In [None]:
# SVC with default hyper-parameters on the combined title+description vectors; accuracy is reported on the validation split.
svmvec3 = SVC()
svmvec3.fit(trvec3, trlable)
predictions = svmvec3.predict(vvec3)
print(accuracy_score(vlable, predictions))

0.8678777869529315


In [None]:
#evaluation process
def score(true,predictions):
    """Print and return classification metrics for ``predictions`` against ``true``.

    Precision, recall and F1 are macro-averaged over the classes.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1' (floats) and
        'confusion_matrix' (2-D array). Previously the function returned
        itself, so callers received a function object instead of the metrics.
    """
    results = {
        'accuracy': metrics.accuracy_score(true, predictions),
        'precision': metrics.precision_score(true, predictions, average='macro'),
        'recall': metrics.recall_score(true, predictions, average='macro'),
        'f1': metrics.f1_score(true, predictions, average='macro'),
        'confusion_matrix': metrics.confusion_matrix(true, predictions),
    }
    print(f"'accuracy': {results['accuracy']}")
    print(f"'precision': {results['precision']}")
    print(f"'recall': {results['recall']}")
    print(f"'f1': {results['f1']}")
    print(f"confusion_matrix: {results['confusion_matrix']}")
    return results
        


In [None]:
# Final evaluation on the test set using the combined title+description vectors.
# score() prints its own report, so its return value is stored rather than
# passed to print(), which would add a stray extra line to the output.
print("for naive bayes:")
predictions = nv_vec3.predict(tevec3)
nb_report = score(telable, predictions)
# k-NN is excluded from the final report (calls kept for reference):
#predictions = knnvec3.predict(tevec3)
#print(score(telable, predictions))
print("for SVM:")
predictions = svmvec3.predict(tevec3)
svm_report = score(telable, predictions)


for naive bayes:
'accuracy': 0.6617105263157895
'precision': 0.6863368875653079
'recall': 0.6617105263157895
'f1': 0.6596897190508548
confusion_matrix: [[1103  548  119  130]
 [ 145 1646   74   35]
 [ 103  339 1073  385]
 [  80  403  210 1207]]
<function score at 0x7f7dda27f8b0>
for SVM:
'accuracy': 0.8626315789473684
'precision': 0.8621943157507272
'recall': 0.8626315789473684
'f1': 0.8622266145519871
confusion_matrix: [[1644   86   93   77]
 [  46 1795   24   35]
 [  89   38 1537  236]
 [  91   55  174 1580]]
<function score at 0x7f7dda27f8b0>


In [None]:
def purity_score(y_true, y_pred):
    """Cluster purity of ``y_pred`` with respect to the true labels ``y_true``.

    For every predicted cluster, count the members of its most frequent true
    class; purity is the sum of those counts divided by the number of samples.
    """
    # Rows are true classes, columns are predicted clusters.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

In [None]:
from sklearn.cluster import KMeans
# K-Means with 4 clusters fitted on the training title vectors; purity is measured
# on the validation split. The result is kept in its own variable so that the
# score() evaluation function is not overwritten, and the seed is fixed so the
# clustering is reproducible.
Cluster = KMeans(n_clusters=4, random_state=42)
Cluster.fit(trvec1)
val_purity = purity_score(vlable, Cluster.predict(vvec1))
print(val_purity)

0.29975227085053674


In [None]:
from sklearn.cluster import KMeans
# K-Means with 4 clusters fitted on the training description vectors; purity is
# measured on the validation split. The result is kept in its own variable so
# that the score() evaluation function is not overwritten, and the seed is fixed
# so the clustering is reproducible.
Cluster = KMeans(n_clusters=4, random_state=42)
Cluster.fit(trvec2)
val_purity = purity_score(vlable, Cluster.predict(vvec2))
print(val_purity)

0.467382328654005


In [None]:
from sklearn.cluster import KMeans
# K-Means with 4 clusters fitted on the training title+description vectors; purity
# is measured on the validation split. The result is kept in its own variable so
# that the score() evaluation function is not overwritten, and the seed is fixed
# so the clustering is reproducible.
Cluster = KMeans(n_clusters=4, random_state=42)
Cluster.fit(trvec3)
val_purity = purity_score(vlable, Cluster.predict(vvec3))
print(val_purity)

0.4186622625928984


In [None]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering (4 clusters) on each training embedding in turn
# (title, description, combination); purity is computed against the training labels.
for train_vectors in (trvec1, trvec2, trvec3):
    pred_lable = AgglomerativeClustering(n_clusters=4).fit_predict(train_vectors)
    print(purity_score(trlable, pred_lable))

0.30163392693225627
0.4746649531852396
0.41949697080961995


In [None]:
from sklearn.metrics.cluster import adjusted_rand_score
# Clustering evaluation on the test set, using the description vectors.
# K-Means is fitted on the training vectors and then assigns the test vectors;
# the seed is fixed so the clustering is reproducible.
model = KMeans(n_clusters=4, random_state=42)
model.fit(trvec2)
kmeans_labels = model.predict(tevec2)  # computed once and reused for both metrics
print('Purity Score for KMeans Clustering:', purity_score(telable, kmeans_labels))
print('Adjusted Rand Index for KMeans Clustering:', adjusted_rand_score(telable, kmeans_labels))

# Agglomerative clustering has no separate predict step, so it is fitted on the
# test vectors directly. Both metrics compare against the true test labels.
pred_labels = AgglomerativeClustering(n_clusters=4).fit_predict(tevec2)
print('Purity Score for Agglomerative Clustering:', purity_score(telable, pred_labels))
print('Adjusted Rand Index for Agglomerative Clustering:', adjusted_rand_score(telable, pred_labels))