# Baseball Win Predictions

In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import csv

# Directory that holds the per-series tweet files, the label file and the
# word2vec model file (every path below is built by joining onto DATA_PATH).
DATA_PATH = 'Data/baseball/yankees/'
# File mapping each series id to its label.
LABEL_FILE_NAME = 'labelfile.csv'
LABEL_FILE_PATH = join(DATA_PATH, LABEL_FILE_NAME)
# Path of the word2vec vectors file used to vectorize tweets.
MODEL = join(DATA_PATH, 'GoogleNews-vectors-negative300.bin')

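### Collecting all series files
##### Format:
tweet text, # retweets, # favorites

##### Notes on the series files:
- The loader below only picks up files whose name ends in `tsv`; the file name without its extension is used as the series id.
- Each file for the series contains 10,000 tweets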

In [2]:
import re

# Read every series file in DATA_PATH into a list of dicts:
#   'seriesid' -> file name without the '.tsv' extension
#   'text'     -> one string per line, with each run of whitespace collapsed
#                 to a single space (so a blank line becomes ' ')
files = list()
# listdir() returns entries in arbitrary order; sort so that files[0] and the
# order of `files` are the same on every run.
for f in sorted(listdir(DATA_PATH)):
    # Require the full '.tsv' suffix so that stripping 4 characters below
    # always removes exactly the extension.
    if isfile(join(DATA_PATH, f)) and f.endswith('.tsv'):
        with open(join(DATA_PATH, f)) as of:
            files.append({
                'seriesid': f[:-4],
                # raw string: '\s' is not a valid escape in a normal literal
                'text': [re.sub(r'\s+', ' ', line) for line in of]
            })

files[0]

{'seriesid': '1022',
 'text': [' ',
  'I wanna watch my @Yankees nowwww 0 1 ',
  "Grown empathy Let's shouting Angels #6th pic.twitter.com/SceooFixY7 244 703 ",
  'ARMY LOVES YOU, ANGELS . pic.twitter.com/NXEKtFACK4 715 3485 ',
  "Yankees hottest hitter in June as of now: .435 AVG .447 OBP 20 H 3 HR It's good to see that his shoulder injury is no longer a problem! 3 5 ",
  'AVENUE ANGELS A SCAM https:// twitter.com/delaghettomaci /status/874757189180805120 0 0 ',
  'angels https://twitter.com/wildmohn/status/874625800519258112/photo/1 pic.twitter.com/vdWXKmcJIM 1 0 ',
  'Vamos @Yankees @jimenaofficial 0 0 ',
  'Congrats to @j_pearson2 on getting drafted by the @Angels in the 3rd; best example of what hard work and dedication does 1 1 ',
  'Miss these 2 precious angels so much pic.twitter.com/EUvjK4lmNM 0 2 ',
  'Hopefully yes 0 0 ',
  'So proud of my nephew! Got drafted to the N.Y. Yankees today...GOD is good! pic.twitter.com/i2DBregsJ7 0 2 ',
  'AVENUE ANGELS IS A SCAM https:// twitte

### Collecting the label file
##### Format:
series id (CSV File Name), label (0-1)

##### Notes on the label:
- A series consists of multiple games played over the course of several consecutive days. The label is the fraction of games in the series that our team won: if our team wins every game, the label is *1*; if our team loses every game, the label is *0*.
- For example, if a series consists of 3 games and our team wins two and loses one, the label is *0.66* (i.e. 2/3).

In [3]:
# The label file is tab-separated and has no header row, so the column
# names are supplied here.
label_file = pd.read_csv(
    LABEL_FILE_PATH,
    sep='\t',
    header=None,
    names=['seriesid', 'label'],
)

## Next step:
- Create a word vector for each tweet in each series:
    - Each word vector must consider sentiment (Zach)
    - Each word vector must consider retweets and favorites (Meaghan)
- Average these word vectors for an 'overall' sentiment of the series.

### After this:
- Create a numpy array containing the 'overall' sentiment vector of each series. (X)
- Create another numpy array with the same number of elements as X, filled with the label corresponding to each element (series) in X. (y)

In [4]:
from gensim.models.keyedvectors import KeyedVectors

# Load the pretrained word vectors from the binary word2vec-format file at
# MODEL; `model` is then used to look up the vector of each token in a tweet.
model = KeyedVectors.load_word2vec_format(MODEL, binary=True)

In [5]:
def sentence_to_vector(model, sentence):
    """Return the average word vector of a sentence.

    Parameters
    ----------
    model : mapping-like
        Anything supporting ``token in model`` and ``model[token]``, where
        the lookup returns a numeric vector (e.g. the loaded word vectors).
    sentence : str or iterable of str
        A raw sentence, which is split on whitespace into tokens, or an
        already tokenized sequence of words.

    Returns
    -------
    The element-wise mean of the vectors of all tokens found in ``model``,
    or ``None`` when no token of the sentence is in ``model``.
    """
    # Iterating over a str yields single characters, not words, so a raw
    # sentence must be tokenized first. Pre-tokenized input is used as-is.
    tokens = sentence.split() if isinstance(sentence, str) else sentence
    vectors = [model[token] for token in tokens if token in model]
    if len(vectors) == 0:
        return None
    # Accumulate with `+` (not `+=`) so the vectors held by the model are
    # never modified in place.
    total = vectors[0]
    for vector in vectors[1:]:
        total = total + vector
    return total / len(vectors)

#### Compute the average of all tweet vectors in a series, where each tweet vector is itself the average of its word vectors
- This is where we need to take into account retweets and favorites

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Build, for every series, the list of its tweet vectors.
# Each entry of `series_vectors` is a dict:
#   'seriesid'      -> id of the series (taken from the loaded file entry)
#   'tweets_vector' -> list with one standardized (n, 1) column per tweet
series_vectors = list()
for file in files:
    tweets_vector = list()
    for tweet in file['text']:
        # Blank lines were collapsed to a single space when the files were read.
        if tweet == ' ':
            continue
        tweet_vector = sentence_to_vector(model, tweet)
        # sentence_to_vector returns None when nothing in the tweet is in the
        # model's vocabulary; such a tweet has no vector, so skip it instead
        # of failing on `None.reshape`.
        if tweet_vector is None:
            continue
        # The scaler is refit on every tweet: the vector is reshaped to one
        # column, so its components are standardized against each other.
        tweets_vector.append(scaler.fit_transform(tweet_vector.reshape(-1, 1)))
    series_vectors.append({
        'seriesid': file['seriesid'],
        'tweets_vector': tweets_vector
    })

In [7]:
# Average the tweet vectors of each series into one 'overall' vector.
# Each entry of `average_series` is a dict:
#   'seriesid'       -> id of the series
#   'average_vector' -> element-wise mean of the series' tweet vectors
average_series = list()
for series in series_vectors:
    tweet_vectors = series['tweets_vector']
    if len(tweet_vectors) == 0:
        # Fail with the offending series id rather than an opaque IndexError.
        raise ValueError(
            'series %r has no tweet vectors to average' % (series['seriesid'],)
        )
    # Use a dedicated accumulator name: assigning to `sum` at cell level would
    # replace the builtin sum() for the rest of the kernel session.
    vector_total = tweet_vectors[0]
    for tweet_vector in tweet_vectors[1:]:
        vector_total = vector_total + tweet_vector
    average_series.append({
        'seriesid': series['seriesid'],
        'average_vector': vector_total / len(tweet_vectors)
    })

In [8]:
# Build X (one average vector per series) and y (the matching labels).
#
# Series ids in `average_series` are strings taken from file names, while the
# ids in `label_file` are whatever pandas parsed (typically integers). Sorting
# the two independently can therefore order them differently (lexicographic
# vs. numeric) and silently pair vectors with the wrong labels. Instead, look
# each label up by its series id so X and y are aligned by construction.
label_by_seriesid = {
    str(seriesid).strip(): label
    for seriesid, label in zip(label_file['seriesid'], label_file['label'])
}
ordered_series = sorted(average_series, key=lambda x: x['seriesid'])

unlabeled = [s['seriesid'] for s in ordered_series
             if s['seriesid'] not in label_by_seriesid]
if unlabeled:
    raise KeyError('no label found for series: %s' % ', '.join(unlabeled))

X_data = [s['average_vector'] for s in ordered_series]
y_data = [label_by_seriesid[s['seriesid']] for s in ordered_series]

In [9]:
# Number of series feature vectors; should equal len(y_data).
len(X_data)

52

In [10]:
# Number of labels; should equal len(X_data).
len(y_data)

52