In [None]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
import nltk
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
import statistics 
import matplotlib.pyplot as plt
from scipy import stats


In [None]:
# Load the IMDB reviews and keep only the first 1000 rows so the experiments
# below finish in a reasonable time.
dataset = pd.read_csv('datasets/imdb.csv')[:1000]
# Preview as the cell's last expression; a bare `dataset.head()` in the middle
# of a cell is evaluated but never rendered by the notebook.
dataset.head()

In [None]:
def removeWords(tmpDataSet, removeMostUsed=True, percentage=0.5):
    """Drop words from every text based on their document frequency.

    The document frequency of a word is the number of texts in ``tmpDataSet``
    that contain it at least once (repeats inside one text count once).
    The cut-off is ``len(tmpDataSet) * percentage``.

    Parameters
    ----------
    tmpDataSet : sequence of token lists
        The tokenised texts. The input is not modified.
    removeMostUsed : bool
        If ``True``, remove words whose document frequency is greater than or
        equal to the cut-off; otherwise remove words whose document frequency
        is less than or equal to the cut-off.
    percentage : float
        Fraction of the number of texts used as the cut-off.

    Returns
    -------
    list of list
        One filtered token list per input text, in the original order.
    """
    threshold = len(tmpDataSet) * percentage

    # Count, for every word, in how many texts it occurs.
    documentFrequency = {}
    for text in tmpDataSet:
        for word in set(text):
            documentFrequency[word] = documentFrequency.get(word, 0) + 1

    # `== True` is kept on purpose so non-bool flags behave as before.
    if removeMostUsed == True:
        banned = {word for word, hits in documentFrequency.items() if hits >= threshold}
    else:
        banned = {word for word, hits in documentFrequency.items() if hits <= threshold}

    return [[word for word in text if word not in banned] for text in tmpDataSet]

In [None]:
def runMultipleTimes(dataset, outputPath="w2v.txt"):
    """Measure how document-frequency word filtering affects classification.

    For each ``[removeMostUsed, percentage]`` configuration the reviews are
    tokenised with ``simple_preprocess``, filtered with ``removeWords``,
    embedded with a Word2Vec model (one mean word vector per review) and
    classified with an ``MLPClassifier`` on 30 random train/test splits
    (``test_size=0.33``). For every configuration the mean and standard
    deviation of the accuracy, the weighted F1 score and the classifier's
    ``loss_`` are written to a plain-text report; the means are also printed.

    Parameters
    ----------
    dataset : DataFrame-like
        Must provide a ``'review'`` column with raw texts and a
        ``'sentiment'`` column with the labels.
    outputPath : str, optional
        Report file, opened in write mode (existing content is replaced).
        The default ``"w2v.txt"`` is also the default report file of
        ``runWindowAndMinCount``; pass a different path to keep both reports.

    Returns
    -------
    None
    """
    percentageAndMostused = [[True, 1.0], [True, .9], [True, .8], [True, .7], [False, .3], [False, .2], [False, .1]]
    repetitions = 30

    def sentenceVectors(model, sentences):
        """Return one mean word vector per sentence."""
        vectors = []
        for sent in sentences:
            known = [model.wv[word] for word in sent if word in model.wv]
            if known:
                vectors.append(np.mean(known, axis=0))
            else:
                # A review can lose all its tokens to the filtering; np.mean([])
                # would yield a NaN scalar and make the classifier fail.
                vectors.append(np.zeros(model.wv.vector_size, dtype=np.float32))
        return vectors

    # Tokenisation does not depend on the configuration, so do it only once.
    tokenised = [simple_preprocess(sent, deacc=True) for sent in dataset['review']]

    # The context manager guarantees the report is flushed and closed, even on error.
    with open(outputPath, "w") as f:
        for elem in percentageAndMostused:
            X1 = removeWords(tokenised, removeMostUsed=elem[0], percentage=elem[1])

            model = Word2Vec(X1, min_count=1, size=300, window=10)

            scores = []
            f1scores = []
            loss = []
            print(elem)
            for _ in range(repetitions):
                X_train, X_test, y_train, y_test = train_test_split(X1, dataset['sentiment'], test_size=0.33)

                # NOTE(review): the same model object is trained again in every
                # repetition, so the epochs accumulate across the 30 runs. Kept
                # as in the original to avoid changing the reported numbers --
                # confirm whether training once before this loop was intended.
                model.train(X1, total_examples=len(X1), epochs=30)
                X_train_v = sentenceVectors(model, X_train)
                X_test_v = sentenceVectors(model, X_test)

                mlp = MLPClassifier(max_iter=300)
                mlp.fit(X_train_v, y_train)
                scores.append(mlp.score(X_test_v, y_test))
                y_pred = mlp.predict(X_test_v)
                f1scores.append(f1_score(y_test, y_pred, average='weighted'))
                loss.append(mlp.loss_)

            avgScore = statistics.mean(scores)
            avgF1Score = statistics.mean(f1scores)
            avgLoss = statistics.mean(loss)

            sdScore = statistics.stdev(scores)
            # Fixed: the F1 deviation was previously computed from the accuracy scores.
            sdF1Score = statistics.stdev(f1scores)
            sdLoss = statistics.stdev(loss)
            print(avgScore,avgF1Score,avgLoss)

            f.write("most used: " + str(elem[0]) + '\n')
            f.write("rate " + str(elem[1]) + '\n\n')

            f.write("Average Score " + str(avgScore) + '\n')
            f.write("Score sd " + str(sdScore) + '\n')
            f.write("All Scores" + str(scores) + '\n\n')

            f.write("Average Loss " + str(avgLoss) + '\n')
            f.write("Loss sd " + str(sdLoss) + '\n')
            f.write("All losses" + str(loss) + '\n\n')

            f.write("Average F1Score " + str(avgF1Score) + '\n')
            f.write("F1Score sd " + str(sdF1Score) + '\n')
            f.write("All F1 Scores" + str(f1scores) + '\n\n\n')




In [None]:
def runWindowAndMinCount(dataset, outputPath="w2v.txt", pvalueDir="pvalues/w2vWindowMinCount"):
    """Grid-search Word2Vec ``min_count`` (1-5) and ``window`` (5-10).

    The reviews are stripped of stop words and tokenised once. For each of the
    30 ``[min_count, window]`` combinations a Word2Vec model is built, every
    review is represented by the mean of its in-vocabulary word vectors, and
    an ``MLPClassifier`` is fitted and evaluated on 30 random train/test
    splits (``test_size=0.33``). Accuracy, weighted F1 and the classifier's
    ``loss_`` are collected per repetition.

    Parameters
    ----------
    dataset : DataFrame-like
        Must provide a ``'review'`` column with raw texts and a
        ``'sentiment'`` column with the labels.
    outputPath : str, optional
        Human-readable report, opened in write mode (existing content is
        replaced). The default ``"w2v.txt"`` is also the default report file
        of ``runMultipleTimes``; pass a different path to keep both reports.
    pvalueDir : str, optional
        Directory receiving ``scores.txt``, ``f1Scores.txt`` and ``loss.txt``
        (one line per configuration, each the ``str`` of the per-repetition
        list). It is created if it does not exist.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide `os`

    repetitions = 30
    minCountAndWindow = [[minCount, window] for minCount in range(1, 6) for window in range(5, 11)]

    def sentenceVectors(model, sentences):
        """Return one mean word vector per sentence."""
        vectors = []
        for sent in sentences:
            known = [model.wv[word] for word in sent if word in model.wv]
            if known:
                vectors.append(np.mean(known, axis=0))
            else:
                # With min_count > 1 a review may contain no in-vocabulary
                # token; np.mean([]) would yield a NaN scalar and make the
                # classifier fail.
                vectors.append(np.zeros(model.wv.vector_size, dtype=np.float32))
        return vectors

    X = [remove_stopwords(sent) for sent in dataset['review']]
    X = [simple_preprocess(sent, deacc=True) for sent in X]

    # Opening the per-metric files fails if their directory is missing.
    if pvalueDir:
        os.makedirs(pvalueDir, exist_ok=True)

    # The context manager closes all four files, even if a run raises.
    with open(outputPath, "w") as f, \
            open(os.path.join(pvalueDir, "scores.txt"), "w") as scoresF, \
            open(os.path.join(pvalueDir, "f1Scores.txt"), "w") as f1scoresF, \
            open(os.path.join(pvalueDir, "loss.txt"), "w") as lossF:
        for elem in minCountAndWindow:

            model = Word2Vec(X, min_count=elem[0], size=300, window=elem[1])

            scores = []
            f1scores = []
            loss = []
            print(elem)
            for _ in range(repetitions):
                X_train, X_test, y_train, y_test = train_test_split(X, dataset['sentiment'], test_size=0.33)

                # NOTE(review): the same model object is trained again in every
                # repetition, so the epochs accumulate across the 30 runs. Kept
                # as in the original to avoid changing the reported numbers --
                # confirm whether training once before this loop was intended.
                model.train(X, total_examples=len(X), epochs=30)
                X_train_v = sentenceVectors(model, X_train)
                X_test_v = sentenceVectors(model, X_test)

                mlp = MLPClassifier(max_iter=300)
                mlp.fit(X_train_v, y_train)
                scores.append(mlp.score(X_test_v, y_test))
                y_pred = mlp.predict(X_test_v)
                f1scores.append(f1_score(y_test, y_pred, average='weighted'))
                loss.append(mlp.loss_)

            avgScore = statistics.mean(scores)
            avgF1Score = statistics.mean(f1scores)
            avgLoss = statistics.mean(loss)

            sdScore = statistics.stdev(scores)
            # Fixed: the F1 deviation was previously computed from the accuracy scores.
            sdF1Score = statistics.stdev(f1scores)
            sdLoss = statistics.stdev(loss)

            scoresF.write(str(scores)+'\n')
            lossF.write(str(loss)+'\n')
            f1scoresF.write(str(f1scores)+'\n')

            f.write("min_count: " + str(elem[0]) + '\n')
            f.write("window: " + str(elem[1]) + '\n\n')

            f.write("Average Score " + str(avgScore) + '\n')
            f.write("Score sd " + str(sdScore) + '\n')
            f.write("All Scores" + str(scores) + '\n\n')

            f.write("Average Loss " + str(avgLoss) + '\n')
            f.write("Loss sd " + str(sdLoss) + '\n')
            f.write("All losses" + str(loss) + '\n\n')

            f.write("Average F1Score " + str(avgF1Score) + '\n')
            f.write("F1Score sd " + str(sdF1Score) + '\n')
            f.write("All F1 Scores" + str(f1scores) + '\n\n\n')



In [None]:
# Run the word-filtering experiment: 7 filter configurations, 30 train/evaluate
# repetitions each. The report is written to w2v.txt.
runMultipleTimes(dataset)

In [None]:
# Run the min_count x window grid: 5 x 6 = 30 configurations, 30 train/evaluate
# repetitions each. By default this also opens w2v.txt in write mode, so it
# replaces the report written by runMultipleTimes.
runWindowAndMinCount(dataset)

In [None]:
# Strip stop words from every review, then tokenise it (lower-cased, accents
# removed), and hold out 33% of the reviews for evaluation.
X = [simple_preprocess(remove_stopwords(review), deacc=True) for review in dataset['review']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    dataset['sentiment'],
    test_size=0.33,
)

In [None]:
# Build a Word2Vec model over all tokenised reviews (train and test alike):
# 300-dimensional vectors, context window of 10, no word dropped (min_count=1).
# NOTE(review): `size` is the keyword of gensim releases before 4.0 (renamed to
# `vector_size` in 4.0) -- confirm the installed gensim version.
model = Word2Vec(X, min_count=1, size=300, window=10)
# Train the model for 30 epochs over the whole corpus.
model.train(X, total_examples=len(X), epochs=30)

In [None]:
def sentenceVectors(model, sentences):
    """Return one vector per sentence: the mean of its in-vocabulary word vectors.

    A sentence without any in-vocabulary token is mapped to a zero vector,
    because ``np.mean`` of an empty list would yield a NaN scalar that the
    classifier cannot consume.
    """
    vectors = []
    for sent in sentences:
        known = [model.wv[word] for word in sent if word in model.wv]
        if known:
            vectors.append(np.mean(known, axis=0))
        else:
            vectors.append(np.zeros(model.wv.vector_size, dtype=np.float32))
    return vectors


# Same embedding procedure for both splits, so it lives in one helper.
X_train_v = sentenceVectors(model, X_train)
X_test_v = sentenceVectors(model, X_test)


In [None]:
# Fit a multi-layer perceptron (library defaults, at most 300 iterations) on
# the training review vectors.
mlp = MLPClassifier(max_iter=300)
mlp.fit(X_train_v, y_train)

In [None]:
# Score the fitted classifier on the held-out review vectors.
mlp.score(X_test_v, y_test)

In [None]:

# Predict labels for the held-out reviews and compute the weighted F1 score
# against the true labels.
y_pred = mlp.predict(X_test_v)
f1_score(y_test, y_pred, average='weighted')