In [7]:
import sys
# Raw string: '\A', '\L' and '\s' are not valid escape sequences, so the
# un-prefixed literal only produced the intended path by accident and
# triggers an invalid-escape warning on newer Python versions.
# NOTE(review): machine-specific absolute path -- confirm it is still needed.
sys.path.append(r'C:\Anaconda3\Lib\site-packages')

import json
import numpy as np
import datetime as dt
import glob
import os
import pandas as pd

# For loading tweets to generators
from itertools import chain

% matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl

### Read tweets into notebook
The name of the game here will be to load tweets with minimum memory usage. My plan is to store the tweets (saved in .json format) in a generator. Then iterate through the object and only store desired information from the tweet. I want to store the following:

- text
- date created
- retweet or original
- user
- number of likes
- number of retweets
- number of user followers
- number of user following

#### Analysis outlook

It might be interesting to try to predict things like the number of likes based on the last three features. By converting the number of likes into categories, e.g. none (0), low (1-5), moderate (6-15), high (16-100), famous (101+), we open the problem up to a range of machine learning algorithms such as kNN, decision trees, and OvR (one-vs-rest) linear models, e.g. linear regression and logistic regression. The categories should be chosen such that each is well represented.

#### Read tweets the lazy way with large memory cost

In [215]:
# # Use glob to search through all dates with specific hashtag
# # then read the files and load the cumulative result into a
# # list to return

# def merge_tweets(file_root, skip=100):
#     ''' Function that compiles tweets from multiple files
#         into a single list.  This may take a while.
        
#         file_root - Root directory to folder
#         skip (int) - Number of files to skip over before storing
#                      a tweet to memory.
#         '''
#     print(list(glob.iglob(file_root+'*')))
#     tweet_files = list(glob.iglob(file_root+'*'))
#     tweets = []
#     for file in tweet_files:
#         with open(file, 'r') as f:
#             for i, line in enumerate(f.readlines()):
#                     if i%skip == 0:
#                         tweets.append(json.loads(line))
#         print('finished importing file:', file)
#     return tweets
#
# data = merge_tweets(file_root='#nhl/', skip=100)

In [210]:
# Use glob to search through all dates with specific hashtag
# then read the files and load the cumulative result into a generator

def merge_tweets(file_root, skip=100, file_start=''):
    ''' Lazily combine the tweets from every matching file in a folder.

        Files are matched with the glob pattern
        file_root + '/' + file_start + '*' and visited in sorted
        (reproducible) order.  Nothing is loaded here: a file is
        only handed to load_tweets once iteration reaches it, so
        at most one file is being processed at any time.

        file_root (str)  - Folder that holds the tweet files.
        skip (int)       - To save memory, keep only the lines
                           whose index i within a file satisfies
                           i%skip == 0. e.g. skip=1 reads in all
                           tweets because i%1 == 0 for all
                           integers i.
        file_start (str) - Prefix the file names must start with.
                           Falls back to file_root when empty.

        Returns a single iterator over everything load_tweets
        yields for the matched files; it is empty when no file
        matches the pattern. '''

    if not file_start:
        file_start = file_root
    pattern = file_root + '/' + file_start + '*'
    # Sort so the tweet order does not depend on the order in which
    # the operating system happens to list the directory.
    tweet_files = sorted(glob.glob(pattern))
    # One flat chain instead of wrapping a new chain around the old one
    # for every file; the generator expression postpones each
    # load_tweets(f, skip) call until the previous files are exhausted.
    return chain.from_iterable(load_tweets(f, skip) for f in tweet_files)

def load_tweets(file, skip):
    ''' Lazily yield tweets from a file holding one JSON object per line.

        This is a generator function: the file is opened when the
        first tweet is requested and is read one line at a time, so
        the whole file is never held in memory at once.

        file (str) - Path of the file to read.
        skip (int) - Keep only the lines whose zero-based index i
                     satisfies i%skip == 0 (skip=1 keeps every
                     line).  Must be non-zero.

        Yields the object decoded by json.loads for each kept line.
        Blank lines are ignored rather than passed to json.loads,
        which would raise on them. '''
    with open(file, 'r') as f:
        for i, line in enumerate(f):
            if i % skip == 0 and line.strip():
                yield json.loads(line)
    
# Folders to read; each folder name is also used as its dictionary key
search_phrases = ['test_files']

# Map every folder name to the tweets merged from that folder,
# sampling one line in every 100
all_tweets = {
    phrase: merge_tweets(file_root=phrase, skip=100)
    for phrase in search_phrases
}

In [208]:
# Inspect the dictionary: each value is an itertools.chain iterator,
# not a list of tweets
all_tweets

{'test_files': <itertools.chain at 0x206a7df0668>}

In [211]:
# Iterate through tweets, check for NHL related
# phrases in tweet['text'] and save qualifying
# tweets to a new file

# Terms shared by the keyword list of every player folder
league_terms = ['NHL', 'nhl', 'hockey', 'Hockey']

# Extra terms for each player folder, listed in the order they follow
# the shared league terms
player_terms = [
    ('Pavelski', ['SJS', 'sjs', 'sharks', 'Sharks', 'jose', 'Jose', 'Joe']),
    ('Lucic', ['LAK', 'kings', 'Kings', 'angeles', 'Angeles', 'Milan']),
    ('Ovechkin', ['WSH', 'wsh', 'caps', 'Caps', 'capitals', 'Capitals',
                  'washington', 'Washington', 'Alex']),
    ('Giroux', ['PHI', 'phi', 'flyers', 'Flyers', 'Philadelphia', 'Claude']),
    ('Jagr', ['FLA', 'fla', 'panthers', 'Panthers',
              'florida', 'Florida', 'Jaromir']),
    ('Tavares', ['NYI', 'nyi', 'islanders', 'Islanders',
                 'york', 'York', 'John']),
    ('Kucherov', ['TBL', 'tbl', 'lightning', 'Lightning',
                  'tampa', 'Tampa', 'Nikita']),
    ('Mrazek', ['DET', 'det', 'Wings', 'wings', 'Detroit', 'Petr']),
    ('Seguin', ['DAL', 'dal', 'stars', 'Stars', 'Dallas', 'Tyler']),
    ('Pominville', ['MIN', 'min', 'wild', 'Wild', 'Minnesota', 'Jason']),
    ('Crosby', ['PIT', 'pit', 'penguins', 'Penguins',
                'Pittsburgh', 'Sidney']),
    ('Lundqvist', ['NYR', 'nyr', 'rangers', 'Rangers',
                   'york', 'York', 'Henrik']),
    ('Tarasenko', ['STL', 'stl', 'blues', 'Blues',
                   'louis', 'Louis', 'Vladimir']),
    ('Kane', ['CHI', 'chi', 'hawks', 'Hawks',
              'chicago', 'Chicago', 'Patrick']),
    ('Perry', ['ANA', 'ana', 'ducks', 'Ducks', 'Anaheim', 'Corey']),
    ('Forsberg', ['NSH', 'nsh', 'predators', 'Predators',
                  'Nashville', 'Filip']),
]

# Folder name -> substrings; a tweet qualifies when its text contains
# at least one of them (matching is case sensitive)
criteria = {'#nhl': ['nhl'], 'test_files': ['nhl']}
criteria.update((player, league_terms + extra)
                for player, extra in player_terms)

# Set to False to leave existing filtered_tweets.json files untouched
# when this cell is re-run.
REWRITE_FILTERED = True

if REWRITE_FILTERED:
    # The values of all_tweets are one-shot iterators, so this loop
    # consumes them; they must be rebuilt before being read again.
    for folder, tweets in all_tweets.items():
        keywords = criteria[folder]
        with open(folder+'/filtered_tweets.json', 'w') as f:
            for t in tweets:
                # any() writes a qualifying tweet exactly once; looping
                # over the keywords and dumping on every hit duplicated
                # tweets that contained more than one keyword.
                if any(word in t['text'] for word in keywords):
                    json.dump(t, f)
                    f.write('\n')

In [220]:
# Read in filtered tweets

# Folders whose filtered output should be loaded back in
search_phrases = ['test_files']

# Map every folder name to the tweets from its filtered_tweets* files;
# skip=1 keeps every line
all_tweets = {
    phrase: merge_tweets(file_root=phrase, skip=1,
                         file_start='filtered_tweets')
    for phrase in search_phrases
}

In [221]:
# Iterate over generators containing tweets and
# append desired information to lists

# Column name -> sequence of keys to follow inside a tweet dictionary
# to reach that column's value
field_paths = (
    ('text', ('text',)),
    ('screen_name', ('user', 'screen_name')),
    ('created_at', ('created_at',)),
    ('retweet_count', ('retweet_count',)),
    ('favorite_count', ('favorite_count',)),
    ('friends_count', ('user', 'friends_count')),
    ('followers_count', ('user', 'followers_count')),
)

# One (initially empty) list per column
data = {column: [] for column, _ in field_paths}

for tweets in all_tweets.values():
    for tweet in tweets:
        for column, path in field_paths:
            # Walk down the nested dictionaries one key at a time
            value = tweet
            for key in path:
                value = value[key]
            data[column].append(value)

In [241]:
# Create pandas dataframe from dictionary: every key of `data` becomes a
# column and its list supplies that column's values, one row per tweet

df = pd.DataFrame(data)

In [242]:
# Add column for retweet or original.  A tweet is flagged as a retweet
# when the first whitespace-separated token of its text is exactly 'RT'.
# The vectorised .str chain gives a missing value (compared as False)
# for empty or whitespace-only text, where text.split()[0] would raise
# IndexError.

RT = df['text'].str.split().str.get(0).eq('RT').tolist()
df['RT'] = RT

# Convert created_at to datetimes

df['created_at'] = pd.to_datetime(df['created_at'])

df.head()

Unnamed: 0,created_at,favorite_count,followers_count,friends_count,retweet_count,screen_name,text,RT
0,2016-04-17 23:33:28,0,191,1536,0,WilliamWisson,"Hitchcock on round 1 series: ""This feels very ...",False
1,2016-04-17 23:04:29,0,47,148,103,jenlbyrnes,RT @martinkilcoyne2: Now that's a big screen t...,True
2,2016-04-17 22:46:33,0,110,101,103,elope2003,RT @martinkilcoyne2: Now that's a big screen t...,True
3,2016-04-17 21:45:24,0,52,363,0,tschatsiek,Capitalize!!!!! #stlblues #nhl #StanleyCupPlay...,False
4,2016-04-17 21:17:59,0,191,1536,0,WilliamWisson,Islanders must fix defensive problems on secon...,False


In [243]:
# Check the column types: created_at should now be datetime64 and RT boolean
df.dtypes

created_at         datetime64[ns]
favorite_count              int64
followers_count             int64
friends_count               int64
retweet_count               int64
screen_name                object
text                       object
RT                           bool
dtype: object

In [239]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataframe
df.describe()

Unnamed: 0,favorite_count,followers_count,friends_count,retweet_count,RT
count,29.0,29.0,29.0,29.0,29
mean,0.206897,559.793103,887.275862,11.689655,0.172414
std,0.619868,1026.912376,864.469651,34.667918,0.384426
min,0.0,30.0,25.0,0.0,False
25%,0.0,110.0,144.0,0.0,0
50%,0.0,191.0,706.0,0.0,0
75%,0.0,598.0,1536.0,0.0,0
max,3.0,5376.0,3909.0,128.0,True
