In [2]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
def find_next_date(date, month=True):
    """Return ``date`` moved forward by one calendar month or by one day.

    Parameters
    ----------
    date : datetime-like
        Any value that supports adding a ``pd.DateOffset``.
    month : bool, default True
        When truthy the step is one month, otherwise it is one day.

    Returns
    -------
    The result of ``date + pd.DateOffset(...)``.
    """
    step = pd.DateOffset(months=1) if month else pd.DateOffset(days=1)
    return date + step

def find_users_and_generate_new_dfs(date, next_date, counts_df=None):
    """Split the tweet-count table around a date window and find users on both sides.

    Rows with ``created_at < date`` are labelled ``'Before'`` and rows with
    ``created_at >= next_date`` are labelled ``'After'``. Rows that fall inside
    ``[date, next_date)`` are left out of the result.

    Parameters
    ----------
    date : datetime-like
        Exclusive upper bound of the "before" period.
    next_date : datetime-like
        Inclusive lower bound of the "after" period.
    counts_df : pandas.DataFrame, optional
        Table with at least ``created_at`` and ``author_ids`` columns. When
        omitted, the notebook-level ``authorid_date_sentiment_counts`` frame is
        used; it must exist by the time the function is called.

    Returns
    -------
    common_users : set
        ``author_ids`` values that occur in both periods.
    labelled : pandas.DataFrame
        The "before" rows followed by the "after" rows, with an added
        ``before_after`` column and a fresh 0..n-1 index.
    """
    if counts_df is None:
        counts_df = authorid_date_sentiment_counts

    created_at = counts_df['created_at']

    # ``assign`` builds new frames instead of writing a column into a filtered
    # slice, which is what makes pandas emit SettingWithCopyWarning. The input
    # table is never modified.
    df_before_date = counts_df[created_at < date].assign(before_after='Before')
    df_after_next_date = counts_df[created_at >= next_date].assign(before_after='After')

    users_before = df_before_date['author_ids'].values
    users_after = df_after_next_date['author_ids'].values
    common_users = set(users_before).intersection(users_after)

    labelled = pd.concat([df_before_date, df_after_next_date], axis=0).reset_index(drop=True)
    return common_users, labelled

def creating_df(date, month=True):
    """Compute the positive-tweet ratio per user for the periods before and after ``date``.

    Only users returned as common by ``find_users_and_generate_new_dfs`` (present
    both before ``date`` and from the next date onwards) are kept. For each such
    user and period the tweet counts are summed per sentiment label, and the
    ratio ``label 1 / (label 1 + label 0)`` is computed.

    Parameters
    ----------
    date : datetime-like
        Reference date; passed to ``find_next_date`` to get the start of the
        "after" period.
    month : bool, default True
        Forwarded to ``find_next_date`` (one-month step when truthy, one-day
        step otherwise).

    Returns
    -------
    pandas.DataFrame
        Columns ``author_ids``, ``before_after`` and ``positive_ratio``. The
        ratio is NaN for a row whose two label counts are both zero.
    """
    next_date = find_next_date(date, month=month)
    unique_users, filtered_df = find_users_and_generate_new_dfs(date, next_date)

    # Keep only the users that appear in both periods.
    common_rows = filtered_df[filtered_df['author_ids'].isin(unique_users)]

    summed = (
        common_rows
        .groupby(['author_ids', 'before_after', 'sentiment_labels'])
        .agg({'counts': 'sum'})
        .reset_index()
    )

    # One row per (user, period), one column per sentiment label. The default
    # mean aggregation of pivot_table changes nothing here because the groupby
    # above already left a single value per cell.
    per_label = summed.pivot_table(index=['author_ids', 'before_after'], columns='sentiment_labels',
                                   values='counts', dropna=False)

    # Make sure both label columns exist: if every selected tweet carries the
    # same label, the pivot only has one column and the lookup of the other
    # label below would raise KeyError. A missing label simply means zero tweets.
    per_label = per_label.reindex(columns=[0, 1]).fillna(0).reset_index()

    positives = per_label[1].values
    negatives = per_label[0].values
    per_label['positive_ratio'] = positives / (positives + negatives)

    # rename_axis drops the leftover 'sentiment_labels' name from the column axis.
    return per_label[['author_ids', 'before_after', 'positive_ratio']].rename_axis(None, axis=1)

def create_diff_df(date, month=True):
    """Return the per-user change in positive ratio around ``date``.

    The long table produced by ``creating_df`` is spread into one row per user
    with a ``Before`` and an ``After`` column; the result keeps ``author_ids``,
    ``diff`` (``After`` minus ``Before``) and a ``date`` column holding the
    ``date`` argument on every row.

    Parameters
    ----------
    date : datetime-like
        Reference date, forwarded to ``creating_df``.
    month : bool, default True
        Forwarded to ``creating_df``.

    Returns
    -------
    pandas.DataFrame
    """
    ratios = creating_df(date, month=month)
    wide = ratios.pivot_table(index='author_ids', columns='before_after',
                              values='positive_ratio').reset_index()

    wide['diff'] = wide['After'].values - wide['Before'].values
    result = wide.drop(columns=['Before', 'After'])

    # Tag every row with the date the comparison was made for.
    result['date'] = [date] * len(result)
    return result

In [None]:
# NOTE(review): this import sits outside the notebook's import cell, and
# TOKENIZER is not referenced by any other cell visible in this section --
# confirm it is still needed.
from nltk.tokenize import RegexpTokenizer
# Tokenizer configured with the regular expression r'\w+'.
TOKENIZER = RegexpTokenizer(r'\w+')

In [None]:
# Base directory for every parquet/pickle file read or written by the cells below.
dataframes_path = "/data/processed/data_frames"

In [None]:
# Load the two input tables that the next cell concatenates column-wise.
# world_data_sentiments_raw: the next cell reads its 'id', 'created_at', 'Anti' and 'Pro' columns.
world_data_sentiments_raw = pd.read_parquet(f"{dataframes_path}/world_data_sentiments_raw.parquet")
# author_ids_and_sentiments: presumably holds the 'author_ids' and 'sentiment_labels'
# columns used later in the notebook -- TODO confirm against the file.
author_ids_and_sentiments = pd.read_parquet(f"{dataframes_path}/author_ids_and_sentiments.parquet")

In [None]:
# Minimum value the larger of the 'Anti'/'Pro' scores must reach for a tweet to be kept.
MIN_SENTIMENT_CONFIDENCE = 0.99


def _build_authorid_date_sentiment(raw_sentiments, author_sentiments,
                                   min_confidence=MIN_SENTIMENT_CONFIDENCE):
    """Attach tweet dates to the author/sentiment table.

    Neither input frame is modified, so the cell can be re-run safely, and all
    intermediates are local so they are released when the function returns.

    Parameters
    ----------
    raw_sentiments : pandas.DataFrame
        Needs a string ``created_at`` column and numeric ``Anti`` / ``Pro`` columns.
    author_sentiments : pandas.DataFrame
        Rows that are paired with the retained tweets purely by position.
        Assumed to describe exactly those tweets, in the same order -- TODO
        confirm how this table was produced.
    min_confidence : float
        Threshold applied to the row-wise maximum of ``Anti`` and ``Pro``.

    Returns
    -------
    pandas.DataFrame
        ``created_at`` (datetime) followed by the columns of ``author_sentiments``.

    Raises
    ------
    ValueError
        If the number of retained tweets differs from ``len(author_sentiments)``.
    """
    # Only the first 10 characters of each timestamp string are parsed
    # (presumably the 'YYYY-MM-DD' part -- confirm against the raw data).
    created_at = pd.to_datetime(raw_sentiments['created_at'].str[:10])

    # Row-wise maximum of the two scores; a row containing NaN yields NaN,
    # which fails the comparison and is therefore dropped.
    top_score = np.max(raw_sentiments[['Anti', 'Pro']].values, axis=1)
    confident_dates = created_at[top_score >= min_confidence].to_frame().reset_index(drop=True)

    author_rows = author_sentiments.reset_index(drop=True)

    # pd.concat(axis=1) aligns on the index and silently pads the shorter
    # frame with NaN, so a length mismatch has to be caught explicitly.
    if len(confident_dates) != len(author_rows):
        raise ValueError(
            f"Cannot pair tweets with authors by position: {len(confident_dates)} tweets "
            f"passed the confidence filter but author_ids_and_sentiments has {len(author_rows)} rows."
        )

    return pd.concat([confident_dates, author_rows], axis=1)


authorid_date_sentiment = _build_authorid_date_sentiment(world_data_sentiments_raw,
                                                         author_ids_and_sentiments)

# save the dataframe
authorid_date_sentiment.to_parquet(f"{dataframes_path}/authorid_date_sentiment.parquet")

In [None]:
# Remove the notebook-level names of both input frames; the cells visible
# below work with authorid_date_sentiment only.
del world_data_sentiments_raw, author_ids_and_sentiments

### Author id - Date - Sentiment - Tweet Counts

In [None]:
# Number of rows per author.
authorid_tweetcount = (
    authorid_date_sentiment
    .value_counts(['author_ids'])
    .reset_index(name='counts')
)

# NOTE: despite its name, this set holds every author whose row count is not 1.
more_than_2_tweets = set(
    authorid_tweetcount.loc[authorid_tweetcount['counts'] != 1, 'author_ids'].values
)

# removing people who have only a single tweet
authorid_date_sentiment = (
    authorid_date_sentiment
    .loc[authorid_date_sentiment['author_ids'].isin(more_than_2_tweets)]
    .reset_index(drop=True)
)

In [None]:
# Collapse identical rows: one row per distinct combination of all columns,
# with the number of occurrences stored in 'counts'.
authorid_date_sentiment_counts = authorid_date_sentiment.value_counts().reset_index(name='counts')
# Persist the counts table (without the index) next to the other data frames.
authorid_date_sentiment_counts.to_parquet(f"{dataframes_path}/authorid_date_sentiment_counts.parquet", index=False)

### Daily

In [None]:
# Distinct dates in ascending order with the first and the last one removed
# (presumably because the earliest date has no "before" rows and the latest has
# no "after" rows -- confirm).
unique_dates = np.sort(authorid_date_sentiment_counts['created_at'].unique())[1:-1]
# One diff frame per date, in date order.
diff_dfs = []

# month=False makes the "after" period start one day after each date.
for date in tqdm(pd.to_datetime(unique_dates)):
    diff_dfs.append(create_diff_df(date, month=False))

In [None]:
# Serialise the list of per-date diff frames to a single pickle file.
# NOTE(review): pickle files should only ever be loaded back from a trusted location.
with open(f'{dataframes_path}/diff_dfs.pkl', 'wb') as f:
    pickle.dump(diff_dfs, f)