### Data Scraping

Using PRAW to scrape post-match threads from r/gunners

In [1]:
import pandas as pd
import configparser
import praw
import datetime as dt

config = configparser.ConfigParser()
config.read(r"C:\Users\Timothy Lim\Documents\credentials.ini")
preload = True

credentials = config['reddit credentials']

# login
reddit = praw.Reddit(
    client_id=credentials['client_id'],
    client_secret=credentials['client_secret'],
    username=credentials['username'],
    password=credentials['password'],
    user_agent=f"testscript by u/{credentials['username']}"
)


# print(reddit.user.me())

In [2]:
# obtain the relevant posts
gunners = reddit.subreddit('gunners')
postMatch = list(gunners.search(query='flair_name:"Post-Match Thread"', sort='new'))

In [3]:
# example post

post = postMatch[0]
created_unix = post.created_utc  # unix timestamp
created_time = dt.datetime.utcfromtimestamp(created_unix)
print(f'utc create unix time: {created_unix}')
print(f'utc time: {created_time}')

print(f'post title: {post.title}')

utc create unix time: 1621788568.0
utc time: 2021-05-23 16:49:28
post title: Post Match Thread: Arsenal 2 - 0 Brighton & Hove Albion [English Premier League]


PRAW Submission documentation: https://praw.readthedocs.io/en/latest/code_overview/models/submission.html?highlight=praw.models.reddit.submission.Submission

In [4]:
# EPL results: https://www.football-data.co.uk/englandm.php

matchResults = pd.read_csv('E0.csv',
                           usecols=['Date', 'HomeTeam', 'AwayTeam',
                                    'FTHG', 'FTAG', 'FTR', 'HTHG', 
                                    'HTAG', 'HTR'])

gunner_mask = (matchResults['HomeTeam']=='Arsenal') | (matchResults['AwayTeam']=='Arsenal')
gunnerResults = matchResults[gunner_mask]

gunnerResults['Date'] = gunnerResults['Date'].apply(lambda x: dt.datetime.strptime(x, '%d/%m/%Y'))
gunnerResults['Date'] = gunnerResults['Date'].apply(lambda x: x.date())

gunnerResults.iloc[-1,:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Date        2021-05-23
HomeTeam       Arsenal
AwayTeam      Brighton
FTHG                 2
FTAG                 0
FTR                  H
HTHG                 0
HTAG                 0
HTR                  D
Name: 370, dtype: object

In [5]:
gunnerResults['Date'].min()

datetime.date(2020, 9, 12)

In [6]:
gunnerResults['Date'].max()

datetime.date(2021, 5, 23)

In [9]:
# post <-> match result mapping
if preload:
    post_df = pd.read_pickle('post_df.pkl')

else:
    post_df = pd.DataFrame()

    # helper variables
    valid_dates = gunnerResults['Date'].unique()
    counter = 0

    for post in postMatch:
        post_date = dt.datetime.utcfromtimestamp(post.created_utc)

        # check if post date matches the epl results from the 20/21 season
        if post_date.date() in valid_dates:
            counter += 1
            row = gunnerResults[gunnerResults['Date']==post_date.date()]
            row['post'] = post
            post_df = post_df.append(row)

            if counter == 38:
                break
        
    post_df.to_pickle('post_df.pkl')

post_df.shape

(36, 10)

In [10]:
# missing posts
found = gunnerResults['Date'].isin(post_df['Date'])
gunnerResults[~found]

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR
40,2020-10-17,Man City,Arsenal,1,0,H,1,0,H
95,2020-11-29,Arsenal,Wolves,1,2,A,1,2,A


In [11]:
# manually get missing posts
missing_ids = ['k3hiw3', 'jd0b73']

for missing_id in missing_ids:
    post = reddit.submission(id=missing_id)
    post_date = dt.datetime.utcfromtimestamp(post.created_utc)
    row = gunnerResults[gunnerResults['Date']==post_date.date()]
    row['post'] = post
    post_df = post_df.append(row)

post_df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(38, 10)

In [12]:
post_df.sort_values(by=['Date'], inplace=True)
post_df.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,post
0,2020-09-12,Fulham,Arsenal,0,3,A,0,1,A,ircjx0
11,2020-09-19,Arsenal,West Ham,2,1,H,1,1,D,iw0l0p
27,2020-09-28,Liverpool,Arsenal,3,1,H,2,1,H,j1knlt
34,2020-10-04,Arsenal,Sheffield United,2,1,H,0,0,D,j505r8
40,2020-10-17,Man City,Arsenal,1,0,H,1,0,H,jd0b73


In [13]:
# create comments dataframe
from progressbar import ProgressBar

comments_df = pd.DataFrame()
pbar = ProgressBar()

if preload:
    comments_df = pd.read_pickle('comments_df.pkl')
    
else:
    for post in pbar(post_df['post']):
        post_comments = post.comments  # returns a CommentForest

        for comment in post_comments:

            try:
                row = {
                    'author': comment.author,
                    'body': comment.body,
                    'n_replies': len(comment.replies),
                    'score': comment.score,
                    'comment': comment,
                    'post': post
                }

                comments_df = comments_df.append(pd.Series(row), ignore_index=True)

            except AttributeError:
                # MoreComments
                continue

    comments_df.to_pickle('comments_df.pkl')
    
print(comments_df.shape)

(11160, 6)


In [14]:
comments_df.head()

Unnamed: 0,author,body,comment,n_replies,post,score
0,akpommed,Elneny at the end picking it up with 3 attacke...,g4xe3hb,2.0,ircjx0,330.0
1,suxer,**LOVE** to finally see **Willian** in... blue...,g4xe3lb,2.0,ircjx0,276.0
2,J4ckrh,"Both our debutants were brilliant, Willian man...",g4xe1qp,7.0,ircjx0,500.0
3,dannywelbad,You know Tierney is a great player when no-one...,g4xibyq,2.0,ircjx0,231.0
4,craigizard,Have a great weekend gunners,g4xe1wd,3.0,ircjx0,649.0


### EDA 

### Sentiment Analysis with VADER