# Baseball Win Predictions

In [51]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import csv

# Directory containing the per-series tweet CSVs; the label file and the
# word2vec model are also resolved relative to it below.
DATA_PATH = 'Data/sample/'
# Name of the label file; kept separately so it can be excluded when the
# tweet CSVs in DATA_PATH are listed.
LABEL_FILE_NAME = 'labelfile.csv'
LABEL_FILE_PATH = join(DATA_PATH, LABEL_FILE_NAME)
# Path to the word2vec vectors file (loaded later in binary format).
MODEL = join(DATA_PATH, 'GoogleNews-vectors-negative300.bin')

### Collecting all CSV files 
##### Format:
Tab-separated columns (the files are read with `delimiter='\t'`): tweet text, number of retweets, number of favorites

##### Notes on the CSV files:
- Each CSV for the series contains 10,000 tweets

In [52]:
# Load every tweet CSV found directly in DATA_PATH (the label file is skipped).
# Each entry is {'seriesid': <file name without '.csv'>, 'data': <DataFrame>}.
files = [{
        # the series id is the file name with the '.csv' extension removed
        'seriesid': f[:-len('.csv')],
        # tab-delimited; column names are supplied here, so the first line of
        # each file is read as data rather than as a header
        'data': pd.read_csv(join(DATA_PATH, f), delimiter='\t', names=['tweet','rts','favs'])
    }
    for f in listdir(DATA_PATH)
            # require the full '.csv' extension: comparing only the last three
            # characters would also accept names that merely end in "csv"
            if isfile(join(DATA_PATH, f)) and f.endswith('.csv') and f != LABEL_FILE_NAME
]

for series_file in files:
    print(series_file)

{'seriesid': '0002', 'data':                                            tweet  rts  favs
0    The Yankees are going to kill the Mariners.    3     1
1  The Mariners will always be my favorite team.    1     1
2                     Who cares about baseball??    0     0
3                Yanks will win the series. Bet.    2     1
4                       Yankees will win it all.    4     5}
{'seriesid': '0001', 'data':                                                tweet  rts  favs
0  Hiya!! This is a sample tweet. The Yankees are...    2     0
1               Wow I love the Yankees! Go baseball!    0     0
2                                  The Yankees suck!    1     4
3             Steak n Shake has terrible milkshakes.    0     0
4        The Yankees have no chance against Oakland!    5     2}


### Collecting the label file
##### Format:
Tab-separated columns: series id (the tweet CSV file name without the `.csv` extension), label (a value between 0 and 1)

##### Notes on the label:
- A series consists of multiple games played over the course of several consecutive days. The label is the fraction of games in the series that our team won: if our team wins every game, the label is *1*; if our team loses every game, the label is *0*.
- For example, if a series consists of 3 games and our team wins two and loses one, the label is 2/3, recorded as *0.66*.

In [53]:
# Read the per-series labels (tab-delimited; column names are supplied here,
# so the first line of the file is read as data).
# The path constant defined with the other paths is LABEL_FILE_PATH; the name
# LABEL_FILE is not defined anywhere in this notebook and raises NameError on
# a fresh kernel.
# seriesid is read as str so ids such as '0001' keep their leading zeros and
# sort the same way as the series ids derived from the CSV file names.
label_file = pd.read_csv(
    LABEL_FILE_PATH,
    delimiter='\t',
    names=['seriesid','label'],
    dtype={'seriesid': str},
)

## Next step:
- Create a word vector for each tweet in each series:
    - Each word vector must consider sentiment (Zach)
    - Each word vector must consider retweets and favorites (Meaghan)
- Average these word vectors for an 'overall' sentiment of the series.

### After this:
- Create a numpy array containing each 'overall' sentiment vector for each series. (X)
- Create another numpy array of the same length as X, filled with the label corresponding to each element (series) of X, in the same order. (y)

In [54]:
from gensim.models.keyedvectors import KeyedVectors

# Load the word2vec vectors used to vectorize tweets.
# binary=True because MODEL points at a '.bin' (binary word2vec format) file.
model = KeyedVectors.load_word2vec_format(MODEL, binary=True)

In [55]:
def sentence_to_vector(model, sentence):
    """Return the average of the word vectors of the words in ``sentence``.

    Parameters
    ----------
    model : mapping-like
        Anything supporting ``word in model`` and ``model[word]`` (for
        example the loaded word2vec vectors); ``model[word]`` must return a
        vector that supports ``+`` and division by an int.
    sentence : str or iterable of str
        A raw sentence, which is split on whitespace into words, or an
        already-tokenized iterable of words.

    Returns
    -------
    The element-wise mean of the vectors of all words found in ``model``,
    or ``None`` when no word of the sentence is in ``model``.
    """
    # Iterating a str directly yields single characters, not words, so a raw
    # sentence has to be split into words before the vocabulary lookup.
    tokens = sentence.split() if isinstance(sentence, str) else sentence
    word_vectors = [model[token] for token in tokens if token in model]
    if not word_vectors:
        return None
    # Accumulate under a name that does not shadow the built-in sum().
    total = word_vectors[0]
    for vector in word_vectors[1:]:
        total = total + vector
    return total / len(word_vectors)

#### Compute the average of all tweet vectors in a series, where each tweet vector is the average of that tweet's word vectors
- This is where we need to take retweets and favorites into account (TODO: the code below does not use them yet)

In [56]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# For every series, build the list of standardized tweet vectors.
# Each entry of series_vectors is
# {'seriesid': <series id>, 'tweets_vector': [<scaled column vector>, ...]}.
series_vectors = list()
for series_file in files:
    tweets_vector = list()
    for tweet in series_file['data']['tweet']:
        tweet_vector = sentence_to_vector(model, tweet)
        # sentence_to_vector returns None when nothing in the tweet is in the
        # model's vocabulary; calling .reshape on None would raise
        # AttributeError, so such tweets are skipped.
        if tweet_vector is None:
            continue
        # reshape(-1, 1) turns the vector into a single column, so the scaler
        # (re-fitted on every call) standardizes across that tweet's own
        # vector components.
        tweets_vector.append(scaler.fit_transform(tweet_vector.reshape(-1, 1)))
    series_vectors.append({
        'seriesid': series_file['seriesid'],
        'tweets_vector': tweets_vector
    })

In [58]:
# Average the tweet vectors of every series into one vector per series.
# Each entry of average_series is
# {'seriesid': <series id>, 'average_vector': <mean of the tweet vectors>}.
average_series = list()
for series in series_vectors:
    tweet_vectors = series['tweets_vector']
    # A series without any tweet vector cannot be averaged; fail with a clear
    # message instead of an opaque IndexError.
    if len(tweet_vectors) == 0:
        raise ValueError('No tweet vectors for series ' + str(series['seriesid']))
    # Accumulate under a dedicated name: assigning to `sum` at notebook level
    # would replace the built-in sum() for every later cell.
    vector_total = tweet_vectors[0]
    for tweet_vector in tweet_vectors[1:]:
        vector_total = vector_total + tweet_vector
    average_series.append({
        'seriesid': series['seriesid'],
        'average_vector': vector_total / len(tweet_vectors)
    })

In [68]:
# Feature list: one averaged vector per series, ordered by series id.
series_by_id = sorted(average_series, key=lambda entry: entry['seriesid'])
X = [entry['average_vector'] for entry in series_by_id]

In [91]:
# Label list, ordered by series id.
# NOTE(review): X and y are each sorted by series id independently, so they
# only line up if the label file lists exactly the series that were loaded
# and both id columns sort the same way — TODO confirm.
labels_by_series = label_file.sort_values(by='seriesid')
y = list(labels_by_series['label'])

In [92]:
# Number of series that have an averaged feature vector.
len(X)

2

In [93]:
# Number of labels; should equal len(X) so every series has a label.
len(y)

2

In [94]:
# Length of the first axis of one averaged series vector, i.e. the number of
# vector components (300 in the recorded run).
len(X[0])

300