In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool
from src.scripts.tools import Tools
from concurrent.futures import ThreadPoolExecutor

In [None]:
from nltk.tokenize import RegexpTokenizer
# Shared tokenizer built from the regex \w+ (runs of word characters);
# used by read_world_tweets to count the tokens in a tweet.
TOKENIZER = RegexpTokenizer(pattern=r'\w+')

In [None]:
# --- Input locations -------------------------------------------------------
# Daily world tweet CSVs (read by read_world_tweets).
world_data_path = "/data/raw/daily_world_en_csv"
# US tweet CSVs carrying a `state` column (read by read_us_tweets).
combined_us_tweets_path = "/data/raw/US_data_csv_joined_with_locations"
# Sentiment result CSVs (read by read_sentiment_results).
sentiments_path = "/data/processed/sentiment_analysis/all_world_raw"

# --- Output location -------------------------------------------------------
# Directory that receives every parquet file written by this notebook.
dataframes_path = "/data/processed/data_frames"

In [None]:
def read_us_tweets(file_name):
    """Load the `id` and `state` columns of one US tweet CSV.

    Args:
        file_name: Name of a CSV file inside `combined_us_tweets_path`.

    Returns:
        DataFrame with exactly the columns `id` and `state`, in that order.

    Side effects:
        Advances the module-level progress bar `pbar` by one step.
    """
    pbar.update()
    wanted = ['id', 'state']
    # usecols keeps pandas from parsing columns that would be thrown away.
    df = pd.read_csv(f"{combined_us_tweets_path}/{file_name}", usecols=wanted)
    # usecols returns columns in file order; re-select to fix the order.
    return df[wanted]

def read_world_tweets(file_name):
    """Load one world tweet CSV and keep tweets with at least 10 tokens.

    Rows whose `text` is null are discarded first; the remaining rows are kept
    only if `TOKENIZER` splits their text into 10 or more tokens. Row order is
    preserved.

    Args:
        file_name: Name of a CSV file inside `world_data_path`.

    Returns:
        DataFrame with the columns `id`, `created_at` and `author_id`.
    """
    kept_columns = ['id', 'created_at', 'author_id']
    # Only the returned columns plus `text` (needed for filtering) are parsed.
    df = pd.read_csv(
        f"{world_data_path}/{file_name}",
        usecols=kept_columns + ['text'],
    )
    df = df[df['text'].notnull()]
    # astype(bool): when no row survives the null filter, apply() returns an
    # empty Series that is not boolean-typed, and indexing with it would be
    # interpreted as a column selection instead of a row mask.
    long_enough = (
        df['text']
        .apply(lambda x: len(TOKENIZER.tokenize(x)) >= 10)
        .astype(bool)
    )
    return df.loc[long_enough, kept_columns]

def read_sentiment_results(file_name):
    """Load one sentiment result CSV from `sentiments_path`.

    Args:
        file_name: Name of a CSV file inside `sentiments_path`.

    Returns:
        The file's full contents as a DataFrame.

    Side effects:
        Advances the module-level progress bar `pbar` by one step.
    """
    pbar.update()
    csv_path = f"{sentiments_path}/{file_name}"
    scores = pd.read_csv(csv_path)
    return scores

First, we will create two DataFrames with the following columns:

- state, tweet count for each sentiment
- date, tweet count for each sentiment

In [None]:
# Tweet CSVs, visited in lexicographic file-name order.
world_data_list = sorted(os.listdir(world_data_path))
us_tweets_list = sorted(os.listdir(combined_us_tweets_path))

# Sentiment files are named "between-<start>-<end>.csv" with consecutive
# windows of `rows_per_file`.
# NOTE(review): the names are presumably generated (not listed and sorted) so
# that the files stay in numeric order — confirm; the later column-wise concat
# with the world tweets depends on this order.
rows_per_file = 1200
n_sentiment_files = 50866
sentiments_list = [
    f"between-{start}-{start + rows_per_file}.csv"
    for start in range(0, n_sentiment_files * rows_per_file, rows_per_file)
]

In [None]:
# `pbar` has to be a module-level name: read_us_tweets and
# read_sentiment_results call pbar.update() on it from the worker threads.
# Using tqdm as a context manager closes the bar even if a reader raises.

# US tweets: one DataFrame per file, in the order of us_tweets_list.
with tqdm(total=len(us_tweets_list)) as pbar, ThreadPoolExecutor() as executor:
    us_tweets = list(executor.map(read_us_tweets, us_tweets_list))

# World tweets are filtered with the tokenizer while loading, so they are read
# in worker processes instead of threads (no progress bar for this step).
with Pool() as pool:
    world_data = pool.map(read_world_tweets, world_data_list)

# Sentiment results: one DataFrame per file, in the order of sentiments_list.
with tqdm(total=len(sentiments_list)) as pbar, ThreadPoolExecutor() as executor:
    sentiments = list(executor.map(read_sentiment_results, sentiments_list))

In [None]:
tools = Tools()

# Collapse each list of per-file DataFrames into a single frame, stacking rows
# (axis=0). NOTE(review): the second positional argument is presumably a
# chunk/batch size used by Tools.concatenate_data — confirm against its source.
concat_options = {'concat_type': 'pd', 'axis': 0}

us_tweets = tools.concatenate_data(us_tweets, 5, **concat_options)
world_data = tools.concatenate_data(world_data, 5, **concat_options)
sentiments = tools.concatenate_data(sentiments, 100, **concat_options)

100%|██████████| 146/146 [01:07<00:00,  2.16it/s]


In [None]:
# Tweets and sentiment rows are paired purely by row position. If the two
# frames differ in length, the concat below would silently pad with NaN and
# pair the remaining rows wrongly, so fail loudly instead.
if len(world_data) != len(sentiments):
    raise ValueError(
        "Cannot pair tweets with sentiment rows by position: "
        f"{len(world_data)} tweets vs {len(sentiments)} sentiment rows"
    )

# Side-by-side concat on a fresh 0..n-1 index: row i of world_data gets row i
# of sentiments.
world_data_sentiments = pd.concat([world_data.reset_index(drop=True), sentiments.reset_index(drop=True)], axis=1)

# Attach the sentiment columns to the US tweets by tweet id; US tweets without
# a match (or with any missing value) are dropped.
us_tweets_sentiments = us_tweets.join(world_data_sentiments.drop(columns=['created_at']).set_index('id'), on='id').dropna()

In [None]:
# Persist both combined frames (without their index) for later reuse.
world_raw_file = f"{dataframes_path}/world_data_sentiments_raw.parquet"
us_raw_file = f"{dataframes_path}/us_tweets_sentiments_raw.parquet"

world_data_sentiments.to_parquet(world_raw_file, index=False)
us_tweets_sentiments.to_parquet(us_raw_file, index=False)

# Preparing Data For Analysis

### Date and Sentiment Counts

In [None]:
# Position of a score column -> sentiment label. The labels double as the
# names of the score columns.
mapping = {0:"Rest", 1:"Pro", 2:"Anti"}
score_columns = [mapping[i] for i in range(len(mapping))]

date_and_sentiment_counts = world_data_sentiments.drop(columns=['id'])

# Select the score columns by name. A positional slice (iloc[:, 1:]) would also
# pick up `author_id`, whose large values would win every argmax/max and label
# every tweet "Rest".
scores = date_and_sentiment_counts[score_columns].values
max_columns = np.argmax(scores, axis=1)
max_values = np.max(scores, axis=1)

# Vectorised lookup of the winning label for each row.
date_and_sentiment_counts['label'] = np.array(score_columns)[max_columns]

# Keep only tweets whose winning score is at least 0.99.
date_and_sentiment_counts = date_and_sentiment_counts[max_values >= 0.99].reset_index(drop=True)

In [None]:
# The raw scores are no longer needed once every row has a label.
date_and_sentiment_counts = date_and_sentiment_counts.drop(columns=['Rest', 'Pro', 'Anti'])

# Keep only the first 10 characters of the timestamp string before parsing.
# NOTE(review): assumes created_at starts with a YYYY-MM-DD date — confirm.
date_and_sentiment_counts['created_at'] = pd.to_datetime(date_and_sentiment_counts['created_at'].str[:10])

# Number of tweets per (day, label). reset_index(name=...) names the count
# column explicitly; renaming column 0 only works on pandas < 2.0, where the
# value_counts result is unnamed.
date_and_sentiment_counts = (
    date_and_sentiment_counts
    .value_counts(['created_at', 'label'])
    .reset_index(name='counts')
)

In [None]:
# Write the per-day label counts to parquet, without the index.
date_counts_file = f"{dataframes_path}/date_and_sentiment_counts.parquet"
date_and_sentiment_counts.to_parquet(date_counts_file, index=False)

### State and Sentiment Counts

In [None]:
state_and_sentiment_counts = us_tweets_sentiments.drop(columns=['id'])

# Score column names in the order of the `mapping` keys (the labels double as
# column names). Selecting by name matters: a positional slice (iloc[:, 1:])
# would also include `author_id`, whose large values would win every
# argmax/max and label every tweet "Rest".
score_columns = [mapping[i] for i in range(len(mapping))]
scores = state_and_sentiment_counts[score_columns].values
max_columns = np.argmax(scores, axis=1)
max_values = np.max(scores, axis=1)

# Vectorised lookup of the winning label for each row.
state_and_sentiment_counts['label'] = np.array(score_columns)[max_columns]

# Keep only tweets whose winning score is at least 0.99.
state_and_sentiment_counts = state_and_sentiment_counts[max_values >= 0.99].reset_index(drop=True)

In [None]:
# The raw scores are no longer needed once every row has a label.
state_and_sentiment_counts = state_and_sentiment_counts.drop(columns=['Rest', 'Pro', 'Anti'])

# Number of tweets per (state, label). reset_index(name=...) names the count
# column explicitly; renaming column 0 only works on pandas < 2.0, where the
# value_counts result is unnamed.
state_and_sentiment_counts = (
    state_and_sentiment_counts
    .value_counts(['state', 'label'])
    .reset_index(name='counts')
)

In [None]:
# Write the per-state label counts to parquet, without the index.
state_counts_file = f"{dataframes_path}/state_and_sentiment_counts.parquet"
state_and_sentiment_counts.to_parquet(state_counts_file, index=False)