In [1]:
import re
import operator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from glob import glob

In [2]:
filenames = glob('data/full_who_dataset*.csv')
dataframes = [pd.read_csv(open(f, 'rb')) for f in filenames]
data = pd.concat(dataframes)

In [3]:
data = data.drop(columns=['Unnamed: 0', 'level_0', 'index', ' '])
data.reset_index(drop=True, inplace=True)

Extract tweets from/about organisations from list of organisations

In [4]:
data_a11 = pd.read_csv('data/org/a11.tsv', delimiter='\t')
data_a12 = pd.read_csv('data/org/a12.tsv', delimiter='\t')
data_a14 = pd.read_csv('data/org/a14.tsv', delimiter='\t')
#data_cat = pd.read_csv('data/org/cat.tsv', delimiter='\t')
#data_brazil = pd.read_csv('data/org/brazil.tsv', delimiter='\t')
data_brazil = pd.read_csv('data/org/brazil_gov.tsv', delimiter='\t')

In [5]:
def org_extraction(dataframe):
    for x in range(len(dataframe)):
        org = []
        org_mention = dataframe.loc[x]['account']
        org_account = re.sub('@', '', org_mention)
        org_local_name = dataframe.loc[x]['local_name']
        org_en_name = dataframe.loc[x]['english_name']

        for num, row in enumerate(data['text']):
            if type(org_mention) == str:
                if re.findall(org_mention, str(row)):
                    org.append(data.loc[num]) # ['text'])
            elif type(org_local_name) == str:
                if re.findall(org_local_name, str(row)):
                    org.append(data.loc[num]) #['text'])
            elif type(org_en_name) == str:
                if re.findall(org_en_name, str(row)):
                    org.append(data.loc[num]) #['text'])
        
        for num, row in enumerate(data['from_user']):
            if org_account == row:
                org.append(data.loc[num]) #['text'])
                
        
        org_data = pd.DataFrame(org)
        print(len(org_data))
        org_data.drop_duplicates(subset ="id_str", keep = False, inplace = True)
        org_data.drop_duplicates(subset ="text", keep = False, inplace = True)
        org_data.drop_duplicates(keep = False, inplace = True)
        org_data = org_data.reset_index(drop=True)
        org_data.to_csv('data/out/cat/cat_'+ org_account +'.csv', sep='\t')

In [6]:
def brazil_extraction(dataframe):
    brazil_dataset = pd.DataFrame()
    for x in range(len(dataframe)):
        org = []
        org_mention = dataframe.loc[x]['account']
        org_account = re.sub('@', '', org_mention)
        org_local_name = dataframe.loc[x]['local_name']
        org_en_name = dataframe.loc[x]['english_name']

        for num, row in enumerate(data['text']):
            if type(org_mention) == str:
                if re.findall(org_mention, str(row)):
                    org.append(data.loc[num]) # ['text'])
            elif type(org_local_name) == str:
                if re.findall(org_local_name, str(row)):
                    org.append(data.loc[num]) #['text'])
            elif type(org_en_name) == str:
                if re.findall(org_en_name, str(row)):
                    org.append(data.loc[num]) #['text'])
        
        for num, row in enumerate(data['from_user']):
            if org_account == row:
                org.append(data.loc[num]) #['text'])
                
        
        org_data = pd.DataFrame(org)
        org_data.drop_duplicates(subset ="id_str", keep = False, inplace = True)
        org_data.drop_duplicates(subset ="text", keep = False, inplace = True)
        org_data.drop_duplicates(keep = False, inplace = True)
        org_data = org_data.reset_index(drop=True)
        print(len(org_data))
        brazil_dataset = pd.concat([brazil_dataset, org_data])
        
        
    brazil_dataset.to_csv('data/out/brazil_gov.csv', sep='\t')

In [7]:
#org_extraction(data_cat)
brazil_extraction(data_brazil)

0
179
105
3


Printing timeline graphs

In [8]:
def save_plot(data, g_step, name, filename):
    graph = {}
    for t in data:
        if t not in graph:
            graph[t] = 1
        else:
            graph[t] += 1
    graph = pd.DataFrame(graph.items(), columns=['date', 'mentions'])
    graph = graph.sort_values(by=['date'])
    fig = plt.figure(figsize=(20,10))
    plt.plot(graph['date'], graph['mentions'], '-', color='green')
    plt.xlabel('Dates')
    plt.ylabel('Number of mentions / tweets from organisation')
    plt.xticks(rotation=90)
    plt.yticks(np.arange(0, max(graph['mentions']), g_step))
    plt.title(name, bbox={'facecolor': '0.8', 'pad': 5})
    fig.savefig('data/graphs/' + filename + '.png')
    plt.close()
    plt.show()

def timeline(dataframe):
    for x in range(len(dataframe)):
        time = []
        org_mention = dataframe.loc[x]['account']
        org_account = re.sub('@', '', org_mention)
        org_local_name = dataframe.loc[x]['local_name']
        org_en_name = dataframe.loc[x]['english_name']

        for num, row in enumerate(data['text']):
            if type(org_mention) == str:
                if re.findall(org_mention, str(row)):
                    time.append(data.loc[num]['time'][:10])
            elif type(org_local_name) == str:
                if re.findall(org_local_name, str(row)):
                    time.append(data.loc[num]['time'][:10])
            elif type(org_en_name) == str:
                if re.findall(org_en_name, str(row)):
                    time.append(data.loc[num]['time'][:10])
        for num, row in enumerate(data['from_user']):
            if org_account == row:
                time.append(data.loc[num]['time'][:10])
        
        if len(time) >= 100 and len(time) < 200 :
            save_plot(time, 5, org_account, org_account)
        elif len(time) >= 200 and len(time) < 500 :
            save_plot(time, 10, org_account, org_account)
        elif len(time) >= 500 and len(time) < 1000 :
            save_plot(time, 30, org_account, org_account)
        elif len(time) >= 1000 and len(time) < 10000 :
            save_plot(time, 50, org_account, org_account)
        elif len(time) > 10000 :
            save_plot(time, 100, org_account, org_account)

In [9]:
#timeline(data_cat)
timeline(data_brazil)

Find top-50 most frequent messages from the datasets

In [14]:
top50 = {}

for text in data['text']:
    if text not in top50:
        top50[text] = 1
    else:
        top50[text] += 1

top50_freq = sorted(top50.items(), key=operator.itemgetter(1), reverse=True)