In [50]:
import csv
import os
import math
import copy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import gensim
import spacy

In [51]:
np.random.seed(42)

In [52]:
def readData(filename, inputLabels, outputLabels = ['emailType']):
    df = pd.read_csv(filename)
    #df.fillna(df.mean(), inplace=True)

    inputs = [] 
    target = [] 

    for _, row in df.iterrows():
        inputs.append([row[x] for x in inputLabels])
        target.append([row[x] for x in outputLabels])
    
    if len(inputLabels) == 1:
        inputs = np.squeeze(inputs)
    if len(outputLabels) == 1:
        target = np.squeeze(target)
    return inputs, target, list(set(target)) 

In [53]:
file = os.path.join(os.getcwd(), 'data', 'spam.csv')
inputsLabels = ['emailText']
outputsLabels = ['emailType']
inputs, outputs, labelNames = readData(file, inputsLabels, outputsLabels)

In [54]:
np.random.seed(5)
noSamples = len(inputs)
indexes = [i for i in range(noSamples)]
trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace = False)
testSample = [i for i in indexes  if not i in trainSample]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [inputs[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]

print(trainInputs[:3])

['Probably, want to pick up more?', "No go. No openings for that room 'til after thanksgiving without an upcharge.", "Fuck babe ... I miss you already, you know ? Can't you let me send you some money towards your net ? I need you ... I want you ... I crave you ..."]


In [55]:
vectorizer = CountVectorizer()

trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.fit_transform(testInputs)


print('vocabulary: ', vectorizer.get_feature_names_out()[:10])
print('features: ', trainFeatures.toarray()[:3][:10])

vocabulary:  ['00' '000' '008704050406' '01223585334' '02' '0207' '02072069400' '03'
 '04' '0578']
features:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [56]:
vectorizer = TfidfVectorizer(max_features = 50)

#trainFeatures = vectorizer.fit_transform(trainInputs)
#testFeatures = vectorizer.transform(testInputs)

#print('vocabulary: ', vectorizer.get_feature_names_out()[:10])
#print('features: ', trainFeatures.toarray()[:3][:10])

In [57]:
modelPath = os.path.join(os.getcwd(), 'models', 'GoogleNews-vectors-negative300.bin')

word2vecModel = gensim.models.KeyedVectors.load_word2vec_format(modelPath, binary = True)
print(word2vecModel.most_similar('support'))

[('supporting', 0.6251285076141357), ('suport', 0.6071150302886963), ('suppport', 0.6053199768066406), ('Support', 0.6044272780418396), ('supported', 0.6009396910667419), ('backing', 0.6007589101791382), ('supports', 0.5269277095794678), ('assistance', 0.520713746547699), ('sup_port', 0.5192490220069885), ('supportive', 0.5110024809837341)]


In [58]:
def featureComputation(model, data):
    features = []
    phrases = [phrase.split() for phrase in data]

    for phrase in phrases:
        vectors = [model[word] for word in phrase if (len(word) > 2) and (word in model.key_to_index)]
        if len(vectors) == 0:
            result = [0.0] * model.vector_size
        else:
            result = np.sum(vectors, axis = 0) / len(vectors)
        features.append(result)
    return features

trainFeatures = featureComputation(word2vecModel, trainInputs)
testFeatures = featureComputation(word2vecModel, testInputs)

In [59]:
unsupervisedClassifier = KMeans(n_clusters=2, random_state=0)
unsupervisedClassifier.fit(trainFeatures)



In [60]:
predicted = unsupervisedClassifier.predict(testFeatures)
predictedTestOutputs = [labelNames[value] for value in predicted]

In [61]:
print("acc: ", accuracy_score(testOutputs, predictedTestOutputs))

acc:  0.8582959641255605
