# Acquire data

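This notebook loads the raw CSV exports of tweets sent to female MPs into a single pandas DataFrame, then builds the MP, hashtag and MP-tweet tables. Each table is pickled to `../data/interim/`.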

In [1]:
import pandas as pd
import os
from datetime import datetime as dt
import re
from functools import reduce

## Load tweets at female MPs

In [2]:
# Directory holding the raw CSV exports of tweets sent to female MPs.
path = '../data/raw/'

# Read every CSV in the directory and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# each iteration; pd.concat over a list gives the same rows in linear time.
# The listing is sorted because os.listdir returns entries in arbitrary order,
# which would otherwise make the row order differ between runs.
frames = [pd.read_csv(os.path.join(path, filename), encoding='latin1')
          for filename in sorted(os.listdir(path))
          if filename.endswith('.csv')]

# pd.concat raises on an empty list, so fall back to an empty frame as before.
df_tweets = pd.concat(frames) if frames else pd.DataFrame()

In [3]:
# Rows without a timestamp cannot be parsed or ordered, so discard them first.
df_tweets.dropna(subset=['Date (GMT)'], inplace=True)

# Turn the 'dd/mm/YYYY HH:MM' strings into datetime values.
df_tweets['Date (GMT)'] = df_tweets['Date (GMT)'].map(
    lambda stamp: dt.strptime(stamp, '%d/%m/%Y %H:%M')
)

# Newest first, keep one row per (GUID, URL) pair, then renumber the rows.
df_tweets = df_tweets.sort_values('Date (GMT)', ascending=False)
df_tweets = df_tweets.drop_duplicates(['GUID', 'URL'])
df_tweets = df_tweets.reset_index(drop=True)

# Short, lower-case column names used by the rest of the notebook.
# NOTE(review): the source key is spelled 'County' but maps to 'country' —
# confirm against the CSV header that this is not a typo for 'Country'.
names = {
    'GUID': 'guid',
    'Date (GMT)': 'date',
    'URL': 'url',
    'Contents': 'tweet',
    'Author': 'author',
    'Name': 'name',
    'County': 'country',
    'State/Region': 'region',
    'City/Urban Area': 'city',
    'Category': 'category',
    'Emotion': 'emotion',
    'Source': 'source',
    'Klout Score': 'klout',
    'Gender': 'gender',
    'Posts': 'posts',
    'Followers': 'followers',
    'Following': 'following',
}
df_tweets = df_tweets.rename(columns=names)

### Extract handles and hashtags for quick searching

In [4]:
# Compiled once at definition time. The raw string avoids the invalid "\s"
# escape warning that a plain string literal triggers on modern Python.
_HANDLE_RE = re.compile(r'@\S+')


def find_handles(text):
    """Return every @-mention found in ``text``.

    A mention is an ``@`` followed by all characters up to the next
    whitespace, so trailing punctuation is kept (``'@bob!'``).

    Parameters
    ----------
    text : str
        Tweet contents. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty list instead of raising ``TypeError``.

    Returns
    -------
    list of str
        Mentions in order of appearance; empty if there are none.
    """
    if not isinstance(text, str):
        return []
    return _HANDLE_RE.findall(text)

# Compiled once at definition time. The raw string avoids the invalid "\s"
# escape warning that a plain string literal triggers on modern Python.
_HASHTAG_RE = re.compile(r'#\S+')


def find_hashtags(text):
    """Return every #-hashtag found in ``text``.

    A hashtag is a ``#`` followed by all characters up to the next
    whitespace, so trailing punctuation is kept (``'#vote!'``).

    Parameters
    ----------
    text : str
        Tweet contents. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty list instead of raising ``TypeError``.

    Returns
    -------
    list of str
        Hashtags in order of appearance; empty if there are none.
    """
    if not isinstance(text, str):
        return []
    return _HASHTAG_RE.findall(text)

In [5]:
# Add one list-valued column per extractor so later cells can search
# mentions and hashtags without re-scanning the tweet text.
for column, extractor in (('handles', find_handles), ('hashtags', find_hashtags)):
    df_tweets[column] = df_tweets['tweet'].map(extractor)

# Persist the cleaned tweets for downstream notebooks.
df_tweets.to_pickle('../data/interim/tweets.pkl')

## Load female MPs

In [6]:
# helper function for quickly matching MPs to mentions
def make_gen(key, series):
    """Yield the index label of every entry of ``series`` that contains ``key``.

    Parameters
    ----------
    key : object
        Value to look for, e.g. an MP's ``'@handle'``.
    series : pandas.Series or mapping
        Label -> container pairs; each container is tested with ``key in``.

    Yields
    ------
    object
        The label of each entry whose container includes ``key``, in
        iteration order.
    """
    # .items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for label, keys in series.items():
        if key in keys:
            yield label

In [7]:
# Read the MP spreadsheet and discard rows that have no Twitter username.
df_mps = pd.read_excel('../data/raw/Hackathon_WomenMP.xlsx')
df_mps.dropna(subset=['twitter_username'], inplace=True)

# Prefix usernames with '@' so they have the same form as the entries
# stored in df_tweets['handles'].
df_mps['twitter_username'] = df_mps['twitter_username'].map(
    lambda user: '@' + user,
    na_action='ignore',
)

# For each MP, collect the index labels of the tweets that mention her handle.
df_mps['mentions'] = df_mps['twitter_username'].map(
    lambda handle: list(make_gen(handle, df_tweets['handles']))
)

# Shorter column names for the demographic fields.
names = {
    'Ethnic minority': 'ethnicity',
    'Religion minority': 'religion',
    'Age/ DOB': 'dob',
}
df_mps.rename(columns=names, inplace=True)

# Number of tweets mentioning each MP.
df_mps['no_tweets'] = df_mps['mentions'].map(len)

df_mps.to_pickle('../data/interim/mps.pkl')

## Extract hashtags

In [8]:
# Map each hashtag to the index labels of the tweets that use it.
hashtags = {}

# .items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, tags in df_tweets['hashtags'].items():
    for tag in tags:
        # Append in place; the previous `setdefault(...) + [i]` form copied
        # the whole list on every occurrence of a tag.
        hashtags.setdefault(tag, []).append(i)

# One row per hashtag, with the list of tweets that mention it.
d = {'hashtag': list(hashtags.keys()), 'mentions': list(hashtags.values())}
df_hashtags = pd.DataFrame(d)

df_hashtags.to_pickle('../data/interim/hashtags.pkl')

## Load MP tweets

In [28]:
# Load the pickled tweets from the raw data folder.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if its origin is trusted.
df_mptweets = pd.read_pickle('../data/raw/FemaleMPTweets.pkl')

# Shorten the politician_* column names.
names = {'politician_name': 'name', 'politician_username': 'handle'}
df_mptweets = df_mptweets.rename(columns=names)

# Combine the separate year/month/day/hour columns into one datetime column.
df_mptweets['date'] = (
    df_mptweets[['tweet_year', 'tweet_month', 'tweet_day', 'tweet_hour']]
    .apply(
        lambda row: dt(row['tweet_year'], row['tweet_month'],
                       row['tweet_day'], row['tweet_hour']),
        axis=1,
    )
)

# The component columns are redundant once 'date' exists.
df_mptweets = df_mptweets.drop(
    columns=['tweet_year', 'tweet_month', 'tweet_day', 'tweet_hour']
)

df_mptweets.to_pickle('../data/interim/mptweets.pkl')

## Extract tweeters?

It might be useful to build a list of Twitter users who have frequently tweeted at MPs, but this is not implemented yet.