In [1]:
import numpy as np
import matplotlib as plt
from os import listdir
from re import sub
from sklearn import model_selection

In [2]:
data_dir = 'dataset'
N = 20
test_size = 0.2
n_splits = 10

In [3]:
def compute_accuracy(Y_, Y):
    num_correct = sum([y_ == y for y_, y in zip(Y_, Y)])
    num_total = len(Y_)
    
    return num_correct / num_total

In [4]:
def find_most_frequent(paths, N):
    dictionary = {}
    
    for path in paths:
        with open(path, 'r', encoding = 'latin-1') as file:
            for line in file:
                for word in line.split(' '):
                    clean_word = sub(r'\W+', '', word).lower()
                    
                    if clean_word not in dictionary:
                        dictionary[clean_word] = 0
                    
                    dictionary[clean_word] += 1
    
    by_frequency = sorted(dictionary.items(), key=lambda x: x[1])
    most_frequent = map(lambda x: x[0], by_frequency[-N:])
    return list(most_frequent)

In [5]:
def calculate_features(paths, most_frequent):
    X = []
    Y_ = []
    
    N = len(most_frequent)
    for path in paths:
        label = path.split('_')[-1]
        Y_.append(label)
        
        total = 0
        features = [0] * N
        
        with open(path, 'r', encoding = 'latin-1') as file:
            for line in file:
                for word in line.split(' '):
                    total += 1
                    clean_word = sub(r'\W+', '', word).lower()
                    
                    if clean_word not in most_frequent:
                        continue
                    
                    features[most_frequent.index(clean_word)] += 1
        
        # features = [f / total for f in features]
        X.append(features)
    
    return np.array(X), np.array(Y_)

In [6]:
data_paths = [data_dir + '/' + file for file in listdir(data_dir)]

most_frequent = find_most_frequent(data_paths, N)
X, Y_ = calculate_features(data_paths, most_frequent)

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y_, test_size=test_size)

In [7]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

model = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.7063492063492064


In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10)

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.7261904761904762


In [9]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(n_estimators=100)

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.7420634920634921


In [10]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.6190476190476191


In [11]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.6547619047619048


In [12]:
from sklearn.svm import SVC

model = SVC()

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.44841269841269843


In [13]:
from sklearn.svm import LinearSVC

model = LinearSVC()

model.fit(X_train, Y_train)
Y = model.predict(X_test)
print(compute_accuracy(Y_test, Y))

0.5873015873015873


In [14]:
def fit_and_compute_accuracy(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    Y = model.predict(X_test)
    return compute_accuracy(Y_test, Y)

In [16]:
from sklearn.model_selection import KFold

accuracies = []

kf = KFold(n_splits=n_splits)
for train, test in kf.split(X, Y_):
    model = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    accuracy = fit_and_compute_accuracy(model, X[train], Y_[train], X[test], Y_[test])
    accuracies.append(accuracy)

print(sum(accuracies) / n_splits)

0.7777777777777777
