In [7]:
import sys

# Raw string: in a normal literal '\A', '\L' and '\s' are invalid escape
# sequences, which newer Python versions warn about.
# NOTE(review): machine-specific absolute path -- confirm it is still needed.
anaconda_site_packages = r'C:\Anaconda3\Lib\site-packages'
# Guard so that re-running the cell does not append duplicates.
if anaconda_site_packages not in sys.path:
    sys.path.append(anaconda_site_packages)

import json
import numpy as np
import datetime as dt
import glob
import os
import pandas as pd

# For loading tweets to generators
from itertools import chain

% matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl

### Read tweets into notebook
The name of the game here will be to load tweets with minimum memory usage. My plan is to store the tweets (saved in .json format) in a generator, then iterate through it and keep only the desired information from each tweet. I want to store the following:

- text
- date created
- retweet or original
- user
- number of likes
- number of retweets
- number of user followers
- number of user following

#### Analysis outlook

It might be interesting to try to predict things like the number of likes based on the last three features. By converting the number of likes into categories, e.g. none (0), low (1-5), moderate (6-15), high (16-100), famous (101+), we open the problem up to a range of classification algorithms such as kNN, decision trees, and OvR (one vs. rest) linear models, e.g. logistic regression. The categories should be chosen such that each is well represented.

In [210]:
# Use glob to search through all dates with a specific hashtag,
# then read the files and load the cumulative result into a generator

def merge_tweets(file_root, skip=100, file_start=''):
    ''' Function that lazily chains the tweets from multiple files
        into a single iterator.

        Files are matched with the glob pattern
        file_root + '/' + file_start + '*'.  The list of files is
        resolved when this function is called, but each file is only
        handed to load_tweets when the iterator reaches it.

        file_root (str)  - Root directory to folder
        skip (int)       - To save memory, skip over lines
                           using the rule: keep line i only
                           if i%skip == 0. e.g. skip=1
                           reads in all tweets because
                           i%1 == 0 for all integers i.
        file_start (str) - Prefix of the file names to match
                           inside file_root.  When empty, the
                           value of file_root is used.

        Returns a single-use iterator over the parsed tweets;
        it is empty when no file matches. '''

    if not file_start:
        file_start = file_root
    pattern = file_root + '/' + file_start + '*'
    # Snapshot the matches now so files created later in the same
    # folder are not picked up while iterating.
    tweet_files = list(glob.iglob(pattern))
    # chain.from_iterable keeps the chain flat; wrapping chain(...) in
    # chain(...) once per file nests one level deeper for every file.
    return chain.from_iterable(load_tweets(f, skip) for f in tweet_files)

def load_tweets(file, skip):
    ''' Lazily yield tweets parsed from a file that holds one JSON
        document per line.

        file (str) - Path of the file to read
        skip (int) - Keep line i only if i%skip == 0
                     (skip=1 keeps every line).

        This is a generator: the file is opened on the first
        iteration and read line by line, so the whole file is never
        held in memory.  The file stays open until the generator is
        exhausted or closed.  Whitespace-only lines are ignored
        instead of raising a JSON decoding error. '''
    with open(file, 'r') as f:
        # Iterate over the file object itself rather than
        # f.readlines(), which would load every line up front.
        for i, line in enumerate(f):
            if i % skip != 0:
                continue
            if not line.strip():
                continue
            yield json.loads(line)
    
# Folders to scan for raw tweet files
search_phrases = ['test_files']

# Map each folder name to its tweet iterator, keeping only
# every 100th line of each file
all_tweets = dict()
for folder in search_phrases:
    all_tweets.update({folder: merge_tweets(file_root=folder, skip=100)})

In [208]:
all_tweets

{'test_files': <itertools.chain at 0x206a7df0668>}

In [211]:
# Iterate through tweets, check for NHL related
# phrases in tweet['text'] and save qualifying
# tweets to a new file

# Keywords per folder.  A tweet qualifies for a folder when its text
# contains at least one of that folder's keywords; matching is a
# case-sensitive substring test, hence the explicit case variants.
criteria = {'#nhl': ['nhl'],
            'test_files': ['nhl']}

# Every player folder shares the same four general hockey terms,
# followed by team / city / first-name terms specific to the player.
criteria.update({
    player: ['NHL', 'nhl', 'hockey', 'Hockey'] + specific
    for player, specific in [
        ('Pavelski', ['SJS', 'sjs', 'sharks', 'Sharks',
                      'jose', 'Jose', 'Joe']),
        ('Lucic', ['LAK', 'kings', 'Kings',
                   'angeles', 'Angeles', 'Milan']),
        ('Ovechkin', ['WSH', 'wsh', 'caps', 'Caps',
                      'capitals', 'Capitals',
                      'washington', 'Washington',
                      'Alex']),
        ('Giroux', ['PHI', 'phi', 'flyers', 'Flyers',
                    'Philadelphia', 'Claude']),
        ('Jagr', ['FLA', 'fla', 'panthers', 'Panthers',
                  'florida', 'Florida', 'Jaromir']),
        ('Tavares', ['NYI', 'nyi', 'islanders', 'Islanders',
                     'york', 'York', 'John']),
        ('Kucherov', ['TBL', 'tbl', 'lightning', 'Lightning',
                      'tampa', 'Tampa', 'Nikita']),
        ('Mrazek', ['DET', 'det', 'Wings', 'wings',
                    'Detroit', 'Petr']),
        ('Seguin', ['DAL', 'dal', 'stars', 'Stars',
                    'Dallas', 'Tyler']),
        ('Pominville', ['MIN', 'min', 'wild', 'Wild',
                        'Minnesota', 'Jason']),
        ('Crosby', ['PIT', 'pit', 'penguins', 'Penguins',
                    'Pittsburgh', 'Sidney']),
        ('Lundqvist', ['NYR', 'nyr', 'rangers', 'Rangers',
                       'york', 'York', 'Henrik']),
        ('Tarasenko', ['STL', 'stl', 'blues', 'Blues',
                       'louis', 'Louis', 'Vladimir']),
        ('Kane', ['CHI', 'chi', 'hawks', 'Hawks',
                  'chicago', 'Chicago', 'Patrick']),
        ('Perry', ['ANA', 'ana', 'ducks', 'Ducks',
                   'Anaheim', 'Corey']),
        ('Forsberg', ['NSH', 'nsh', 'predators', 'Predators',
                      'Nashville', 'Filip']),
    ]
})

# Manual switch: set to False to skip rewriting the filtered files.
write_filtered = True

if write_filtered:
    for folder in all_tweets.keys():
        keywords = criteria[folder]
        with open(folder+'/filtered_tweets.json', 'w') as f:
            # all_tweets[folder] is a one-shot iterator; it is exhausted
            # once this loop finishes.
            for t in all_tweets[folder]:
                # Records without a 'text' field never match.
                # NOTE(review): assumes such records can occur in the
                # raw files -- confirm against the collection script.
                text = t.get('text', '')
                # any() writes a tweet at most once, even when several
                # keywords appear in its text.
                if any(word in text for word in keywords):
                    json.dump(t, f)
                    f.write('\n')

In [212]:
# Reload only the tweets that passed the keyword filter

# Folders whose filtered output should be read back
search_phrases = ['test_files']

# Rebuild the folder -> tweet iterator mapping from the
# filtered files, keeping every line (skip=1)
all_tweets = dict()
for folder in search_phrases:
    all_tweets.update({
        folder: merge_tweets(file_root=folder, skip=1,
                             file_start='filtered_tweets')
    })

### Read tweets the lazy way with large memory cost

In [215]:
# # Use glob to search through all dates with a specific hashtag,
# # then read the files and load the cumulative result into a
# # list to return

# def merge_tweets(file_root, skip=100):
#     ''' Function that compiles tweets from multiple files
#         into a single list.  This may take a while.
        
#         file_root - Root directory to folder
#         skip (int) - Number of files to skip over before storing
#                      a tweet to memory.
#         '''
#     print(list(glob.iglob(file_root+'*')))
#     tweet_files = list(glob.iglob(file_root+'*'))
#     tweets = []
#     for file in tweet_files:
#         with open(file, 'r') as f:
#             for i, line in enumerate(f.readlines()):
#                     if i%skip == 0:
#                         tweets.append(json.loads(line))
#         print('finished importing file:', file)
#     return tweets

In [216]:
# data = merge_tweets(file_root='#nhl/', skip=100)

In [217]:
# ''' Test cell '''
    
# for i, d in enumerate(data[::-1]):
#     print(d['retweet_count'])
#     print(d['favorite_count'])
#     print(d['text'])
#     print(d['user']['friends_count'])
#     print(d['user']['followers_count'])
#     print(d['user']['screen_name'])
#     print(d['created_at'])
#     print('')
#     if i==0:
#         break

In [None]:
def populate_tweet_df(tweets):
    ''' Build a DataFrame with one row per tweet.

        tweets - Iterable of tweet dicts.  Each must provide
                 tweet['user']['screen_name'], tweet['text'] and
                 tweet['created_at'].  One-shot iterators (generators,
                 itertools.chain objects) are supported because the
                 input is traversed only once.

        Returns a DataFrame with the columns 'user', 'text' and
        'created_at', in that order; it has no rows when tweets
        is empty.

        Raises KeyError if a tweet lacks one of the fields above. '''
    columns = ['user', 'text', 'created_at']

    # Single pass over the input: mapping over `tweets` once per column
    # would exhaust an iterator after the first column.
    rows = [(tweet['user']['screen_name'],
             tweet['text'],
             tweet['created_at'])
            for tweet in tweets]

    df = pd.DataFrame(rows, columns=columns)

    # Candidate extra columns, currently disabled:
#    df['location'] = list(map(lambda tweet: tweet['user']['location'], tweets))   
#    df['country_code'] = list(map(lambda tweet: tweet['place']['country_code']
#                                  if tweet['place'] != None else '', tweets))
#    df['long'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][0]
#                        if tweet['coordinates'] != None else 'NaN', tweets))   
#    df['latt'] = list(map(lambda tweet: tweet['coordinates']['coordinates'][1]
#                        if tweet['coordinates'] != None else 'NaN', tweets))

    return df