# Baseball Win Predictions

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import csv
import gameSentiment as gs

# directory containing the CSVs
DATA_PATH = 'Data/baseball/yankees/'
LABEL_FILE_NAME = 'labelfile.csv'
LABEL_FILE_PATH = join(DATA_PATH, LABEL_FILE_NAME)
MODEL = join(DATA_PATH, 'GoogleNews-vectors-negative300.bin')

### Collecting all CSV files 
##### Format:
tweet text, # retweets, # favorites

##### Notes on the CSV files:
- Each CSV for the series contains 10,000 tweets

In [2]:
import re

files = list()
for f in listdir(DATA_PATH):
    if isfile(join(DATA_PATH, f)) and f[-3:] == 'tsv':
        with open(join(DATA_PATH, f)) as of:
            files.append({
                'seriesid': f[:-4],
                'text': [re.sub('\s+', ' ', line) for line in of]
            })

### Collecting the label file
##### Format:
series id (CSV File Name), label (0-1)

##### Notes on the label:
- A series consists of multiple games played over the course of several consecutive days. If our team wins, we consider the label to be *1*. If our team loses, we consider the label to be *0*.
- If a series consists of 3 games and our team wins two and loses one, the label is *0.66*.

In [3]:
label_file = pd.read_csv(LABEL_FILE_PATH, delimiter='\t', names=['seriesid','label'])

## Next step:
- Create a word vector for each tweet in each series:
    - Each word vector must consider sentiment (Zach)
    - Each word vector must consider retweets and favorites (Meaghan)
- Average these word vectors for an 'overall' sentiment of the series.

### After this:
- Create a numpy array containing each 'overall' sentiment vector for each series. (X)
- Create another numpy array of the same size as the X. This time fill  the array with the labels corresponding to each element (series) in X. (y)

In [4]:
from gensim.models.keyedvectors import KeyedVectors

# load the model to vectorize tweets
model = KeyedVectors.load_word2vec_format(MODEL, binary=True)

In [5]:
# computes the word vector for each word in the sentence.
# averages together all of the word vectors and returns the average
def sentence_to_vector(model, sentence):
    words = [model[w] for w in sentence if w in model]
    if len(words) != 0:
        sum = words[0]
        for i in range(1, len(words)):
            sum = sum + words[i]
        return sum / len(words)
    return None

#### Compute the average of all of the tweet vectors which are the average of each word vector
- This is where we need to take into account retweets and favorites

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

series_vectors = list()
for file in files:
    tweets_vector = list()
    for tweet in file['text']:
        if tweet != ' ':
            tweets_vector.append(scaler.fit_transform(sentence_to_vector(model, tweet).reshape(-1,1)) * gs.getSentiment(tweet))
    series_vectors.append({
        'seriesid': file['seriesid'],
        'tweets_vector': tweets_vector
    })

In [7]:
average_series = list()
for series in series_vectors:
    sum = series['tweets_vector'][0]
    for i in range(1, len(series['tweets_vector'])):
        sum = sum + series['tweets_vector'][i]
    average_series.append({
        'seriesid': series['seriesid'],
        'average_vector': sum / len(series['tweets_vector'])
    })

#### Creating X

In [8]:
temp = [x['average_vector'] for x in sorted(average_series, key=lambda x: x['seriesid'])]
X = list()
for x in temp:
    a = list()
    for item in x:
        a.append(item[0])
    X.append(a)
len(X[0])

300

#### Creating y

In [9]:
y = list(label_file.sort_values('seriesid')['label'])

In [14]:
len(X)

52

In [15]:
len(y)

52

### Now we need to split the 52 series into *training* and *test* sets

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.linear_model import SGDClassifier
import numpy as np

In [18]:
clf = SGDClassifier()
clf.fit(X_train, np.asarray(y_train, dtype='|S9'))



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [19]:
clf.predict(X_test)

array([b'0.3333333', b'0.6666666', b'0.6666666', b'0.3333333',
       b'0.3333333', b'0.3333333', b'0.6666666', b'0.6666666',
       b'0.3333333', b'0.3333333', b'0.3333333'], 
      dtype='|S12')

In [21]:
y_test

[1.0,
 0.0,
 1.0,
 0.6666666667,
 0.6666666667,
 0.3333333333,
 0.5,
 0.6666666667,
 1.0,
 0.75,
 0.3333333333]

In [22]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, np.asarray(y_train, dtype='|S9'))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [23]:
clf.predict(X_test)

array([b'0.6666666', b'0.6666666', b'0.3333333', b'0.75', b'1.0',
       b'0.3333333', b'0.6666666', b'0.5', b'0.3333333', b'0.3333333',
       b'0.3333333'], 
      dtype='|S9')

In [24]:
y_test

[1.0,
 0.0,
 1.0,
 0.6666666667,
 0.6666666667,
 0.3333333333,
 0.5,
 0.6666666667,
 1.0,
 0.75,
 0.3333333333]