# MSR 2018 Challenge Notebook - Use Data
This notebook contains the prediction work for Tyson Bulmer's MSR 2018 Challenge paper, using the supplied dataset.

In [None]:
import pandas as pd
import numpy as np

import math

import matplotlib.pyplot as plt

from tqdm import tqdm


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

## Read in formatted data

In [None]:
# Load the event table from the CSV in the working directory.
event_data = pd.read_csv('data.csv')
# Drop every row whose command is the "unknown button" event.
unknown_button_mask = event_data['command'].eq('CommandEvent-unknown.button')
event_data = event_data.loc[~unknown_button_mask]

In [None]:
# Number of rows per distinct command (value_counts sorts most frequent first).
command_column = event_data['command']
counts = command_column.value_counts()
counts  # last expression in the cell, so the notebook displays it

In [None]:
import math

# Running share of all events covered as commands are added in the order
# they appear in `counts`.
cumulative_share = counts.cumsum() / counts.sum()

# A plain list is plotted, so the x-axis is the position of each command in
# `counts` (0, 1, 2, ...).
# NOTE(review): the x label says "Number of events" although the axis appears
# to count distinct commands - confirm the intended wording.
plt.plot(cumulative_share.tolist())
plt.title('Cumulative sum of different command events')
plt.xlabel('Number of events')
plt.ylabel('Cumulative sum')
# Write the figure to disk, then display it.
plt.savefig('cumulativesum.png')
plt.show()

In [None]:
# Keep only the commands whose running share of all events (taken in the
# order of `counts`) stays below 90%.
cumulative_share = counts.cumsum() / counts.sum()
frequent_commands = counts[cumulative_share < .9].index
event_data = event_data[event_data['command'].isin(frequent_commands)]

# Down-sample to at most 100,000 rows.
# - Capping at the number of available rows avoids the ValueError that
#   DataFrame.sample raises when asked for more rows than exist.
# - A fixed random_state makes the subset, and every score computed from it,
#   reproducible across notebook runs.
event_data = event_data.sample(min(100000, len(event_data)), random_state=0)

## Predict across different N-Gram ranges

In [None]:
# (min_n, max_n) n-gram ranges to evaluate: unigrams only, then up to
# bigrams, trigrams and 4-grams.
ngram_combos = [(1, upper) for upper in range(1, 5)]

In [None]:
# Mean 10-fold cross-validation score of MultinomialNB for each n-gram range.
# Results are stored in `d`, keyed by the (min_n, max_n) tuple.
d = {}
# The raw text and labels do not depend on the n-gram range, so read them once.
texts, y = event_data['events'], event_data['command']
for combo in ngram_combos:
    # Keep the count matrix sparse: MultinomialNB accepts scipy sparse input,
    # and a dense (rows x vocabulary) array grows very large for wide n-gram
    # ranges.
    X = CountVectorizer(ngram_range=combo).fit_transform(texts)
    model = MultinomialNB()
    print(X.shape)
    # cross_val_score returns one score per fold; average them.
    d[combo] = cross_val_score(model, X, y, cv=10).mean()

### Try other models

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression

In [None]:
# Mean 10-fold cross-validation score of several other classifiers for each
# n-gram range.
#
# `scores_by_model` maps model name -> {(min_n, max_n): mean score} and keeps
# the results of every model. `d_1` holds the scores of the model currently
# being evaluated (and therefore of the last model once the loop finishes);
# previously it was the only store, so each model overwrote the one before it.
d_1 = {}
scores_by_model = {}
k = {'GaussianNB': GaussianNB, 'BernoulliNB': BernoulliNB, 'LogisticRegression': LogisticRegression}
# GaussianNB rejects scipy sparse matrices; the other models accept them.
dense_only = {'GaussianNB'}

# Vectorize once per n-gram range instead of once per (model, range) pair.
texts, y = event_data['events'], event_data['command']
features = {
    combo: CountVectorizer(ngram_range=combo).fit_transform(texts)
    for combo in ngram_combos
}

for name, model_t in k.items():
    print(name)
    d_1 = {}
    for combo in ngram_combos:
        X = features[combo]
        if name in dense_only:
            # Densify only where the estimator requires it, to limit memory.
            X = X.toarray()
        model = model_t()
        # cross_val_score returns one score per fold; average them.
        d_1[combo] = cross_val_score(model, X, y, cv=10).mean()
    scores_by_model[name] = d_1
    print(d_1)