# CTM Result Analysis and Data Cleaning

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool

from src.scripts.tools import Tools
from src.scripts.read_data import ReadData

In [None]:
import nltk
# Fetch every NLTK data package. Later cells of this notebook use the English stop-word
# list, the WordNet lemmatizer and the POS tagger.
# NOTE(review): 'all' downloads far more than those three resources; narrowing it down
# needs a check of what the imported project helpers require — TODO confirm.
nltk.download('all')
from nltk.corpus import stopwords

from nltk.tokenize import RegexpTokenizer
# Word tokenizer that is used only for counting the words of a tweet.
_WORD_COUNT_TOKENIZER = RegexpTokenizer(r'\w+')
# Public name kept because other cells refer to (and later rebind) `TOKENIZER`.
TOKENIZER = _WORD_COUNT_TOKENIZER

# Element-wise word count for an array of strings.
# * The lambda uses the private tokenizer, so rebinding the global `TOKENIZER` further
#   down the notebook no longer silently changes how words are counted on a re-run.
# * `otypes` is declared so a zero-length input yields an empty integer array instead of
#   raising ValueError (np.vectorize cannot infer the output type without data).
tokenize_data = np.vectorize(lambda x: len(_WORD_COUNT_TOKENIZER.tokenize(x)), otypes=[int])

### Cumulative Distribution

In [None]:
# --- raw inputs ---
raw_data_path = "/data/raw"
world_data_path = "/data/raw/daily_world_en_csv"

# --- CTM artefacts (model inputs, topic probabilities, topic keywords) ---
ctm_data_path = "/data/processed/CTM"
probs_saving_path = "/data/processed/CTM/probs"
topics_saving_path = "/data/processed/CTM/topics"

# --- tabular outputs of this notebook ---
dataframes_path = "/data/processed/data_frames"

In [None]:
# Load the topic probabilities stored in probs_8_200_0.2.parquet.
# presumably one row per document and one column per topic — the cells below sort and
# argmax along axis=1; TODO confirm the exact schema.
topics_predictions_df = pd.read_parquet(f"{probs_saving_path}/probs_8_200_0.2.parquet")

### Cleaning and Saving

In [None]:
# Minimum gap required between the two largest probabilities of a row.
margin = 1/8

# Ascending sort inside each row: the largest value ends up last, the runner-up just before it.
sorted_preds = np.sort(topics_predictions_df, axis=1)
top_probability = sorted_preds[:, -1]
second_probability = sorted_preds[:, -2]

# True for rows whose best topic beats the second best by more than `margin`.
greater_than_margin = (top_probability - second_probability) > margin

In [None]:
# Keep only the rows that passed the margin test; the original row labels are preserved.
new_topics_predictions_df = topics_predictions_df.loc[greater_than_margin]

In [None]:
# Persist the filtered probabilities. `index=False` drops the row labels, so the file
# read back later starts from a fresh 0..n-1 index.
new_topics_predictions_df.to_parquet(f"{probs_saving_path}/cleaned_probs_8_200_0.2.parquet", index=False)

### Cleaning world anti tweets

In [None]:
# Only the 'retained_indices' column is used, so read just that column instead of
# materialising the whole preprocessing table.
sp_preprocess_results = pd.read_parquet(f"{ctm_data_path}/sp_preprocess_results.parquet",
                                        columns=['retained_indices'])
retained_indices = sp_preprocess_results['retained_indices'].values

# The frame itself is not needed afterwards; only the array is kept.
del sp_preprocess_results

In [None]:
# Row labels that survived the margin filter, used as positions into `retained_indices`.
kept_rows = np.array(new_topics_predictions_df.index)
# The result is used with `.iloc` on the raw tweet table in the next cell.
cleaned_indices = retained_indices[kept_rows]

In [None]:
# Raw tweets (with ids) as stored on disk.
world_anti_tweets = pd.read_parquet(f"{raw_data_path}/world_anti_tweets_and_ids.parquet")
# Positional selection of the tweets whose topic assignment passed the margin filter.
cleaned_world_anti_tweets = world_anti_tweets.iloc[cleaned_indices]

In [None]:
# `cleaned_world_anti_tweets` was produced by an `.iloc` selection, so assigning a column
# to it in place raises pandas' SettingWithCopyWarning. `assign` builds a new frame with
# the extra column instead, which gives the same data without the ambiguous write.
cleaned_world_anti_tweets = cleaned_world_anti_tweets.assign(cleaned_indices=cleaned_indices)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Persist the filtered tweets next to the raw data. `index=False` drops the original row
# labels; the positions into the raw table remain available in the 'cleaned_indices' column.
cleaned_world_anti_tweets.to_parquet(f"{raw_data_path}/cleaned_world_anti_tweets_and_ids.parquet", index=False)

# CTM DataFrames

In [None]:
# Reload the two artefacts saved by the cleaning section so this part of the notebook can
# start from disk. Both files were written with index=False, so both frames get a fresh
# default index here.
cleaned_world_anti_tweets = pd.read_parquet(f"{raw_data_path}/cleaned_world_anti_tweets_and_ids.parquet")
new_topics_predictions_df = pd.read_parquet(f"{probs_saving_path}/cleaned_probs_8_200_0.2.parquet")

In [None]:
# Directory holding the daily CSV exports (same location as `world_data_path`).
data_path = "/data/raw/daily_world_en_csv"
# Directory entries in lexicographic order.
list_of_data = sorted(os.listdir(data_path))

In [None]:
def read_files(file_name):
    """Load one daily CSV and return id/created_at of the tweets with at least 10 words.

    Parameters
    ----------
    file_name : str
        Name of a CSV file located in ``data_path``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', 'created_at']`` for the rows whose ``text`` is not null and has
        at least 10 tokens according to ``tokenize_data``.
    """
    # Only these three columns are used, so the rest of the file is not parsed.
    df = pd.read_csv(f"{data_path}/{file_name}", usecols=['id', 'created_at', 'text'])
    df = df[df['text'].notnull()]
    # A vectorised function without declared output types raises on a zero-length array,
    # so a file without any usable text skips the word count and returns an empty frame.
    if not df.empty:
        df = df[tokenize_data(df['text'].values) >= 10]
    return df[['id', 'created_at']]

In [None]:
# Parse the daily files with 8 worker processes; `pool.map` returns the per-file frames
# in the same order as `list_of_data`.
# NOTE(review): `read_files` is defined inside the notebook; on platforms that start
# workers with "spawn" such functions may not be importable by the workers — confirm.
with Pool(processes=8) as pool:
    tweets_with_dates = pool.map(read_files, list_of_data)

In [None]:
# Combine the per-file frames into a single table via the project helper.
# NOTE(review): the meaning of the positional argument 10 and of concat_type='pd' is
# defined in src.scripts.tools and is not visible here — presumably a batch size and
# "concatenate as pandas"; TODO confirm.
tools = Tools()
id_date = tools.concatenate_data(tweets_with_dates, 10, concat_type='pd')

100%|██████████| 73/73 [00:32<00:00,  2.27it/s]


### date, topic, count

In [None]:
# Attach `created_at` to every tweet by id (left join on the tweets' 'id' column).
# `id_date` is de-duplicated on 'id' first: a repeated id would multiply the matching
# tweet rows and break the row-by-row alignment with `new_topics_predictions_df` that
# the topic assignment further down relies on.
cleaned_world_anti_tweets = cleaned_world_anti_tweets.join(
    id_date.drop_duplicates(subset='id').set_index('id'), on='id')

In [None]:
# Kept so that any code still calling `format_date` keeps working.
format_date = np.vectorize(lambda x: x[:10])

# Keep the first 10 characters of the timestamp (presumably the YYYY-MM-DD part — confirm)
# and parse them. The `.str` accessor is vectorised and propagates missing values (tweets
# whose id found no match in the join) to NaT, whereas slicing element by element fails
# on a float NaN and on an empty column.
cleaned_world_anti_tweets['created_at'] = pd.to_datetime(cleaned_world_anti_tweets['created_at'].str[:10])

In [None]:
# Most probable topic of each row, shifted to 1-based topic numbers.
dominant_topic = new_topics_predictions_df.values.argmax(axis=1) + 1

date_topic = pd.DataFrame({'date': cleaned_world_anti_tweets['created_at'],
                           'topic': dominant_topic})

# This file is read back later in the notebook.
date_topic.to_parquet(f"{dataframes_path}/date_and_topic_8_200_0.2.parquet", index=False)

In [None]:
# Number of tweets per (date, topic) pair.
# `reset_index(name='counts')` names the count column explicitly. Renaming column `0`
# only works on pandas < 2.0; from 2.0 on the value_counts result is already named
# 'count', so the rename was a silent no-op and the column came out as 'count'.
date_topic_count = (
    date_topic
    .value_counts()
    .reset_index(name='counts')
    .sort_values('date')
)

date_topic_count.to_parquet(f"{dataframes_path}/date_topic_count_8_200_0.2.parquet", index=False)

### date, keyword, count

In [None]:
# Start this section from disk: the cleaned tweets, the per-tweet date/topic table and
# the keyword table of the topics.
# presumably `top_50_keywords` has one column per topic named 'topic_<n>' — that is how
# the loop below accesses it; TODO confirm the schema.
cleaned_world_anti_tweets = pd.read_parquet(f"{raw_data_path}/cleaned_world_anti_tweets_and_ids.parquet")
date_topic = pd.read_parquet(f"{dataframes_path}/date_and_topic_8_200_0.2.parquet")
top_50_keywords = pd.read_parquet(f"{topics_saving_path}/topics_8_200_0.2.parquet")

In [None]:
# Output directory for the per-topic keyword counts; created if missing and left alone
# if it already exists.
day_keyword_count_path = f"{topics_saving_path}/day_keyword_count_for_topics_8_200_0.2"
os.makedirs(day_keyword_count_path, exist_ok=True)

In [None]:
# Every date that occurs in the data, sorted; used as the left side of a join so that
# each keyword gets a row for every date.
distinct_dates = date_topic['date'].unique()
unique_date_df = pd.DataFrame({
    'date': distinct_dates,
    'dummy_value': np.arange(len(distinct_dates)),
}).sort_values('date')

# Side-by-side concatenation: both frames were just read from disk, so rows line up by
# their default index.
date_text_topic = pd.concat([cleaned_world_anti_tweets, date_topic], axis=1)[['date', 'text', 'topic']]

In [None]:
# For every topic (numbered 1..8) write one long table with columns date / keyword / count:
# the number of that topic's tweets per date whose text contains the keyword.
for topic in tqdm(range(1, 9)):
    topic_df = date_text_topic[date_text_topic['topic'] == topic]

    # Collect one frame per keyword and concatenate once at the end; growing a frame
    # with concat inside the loop copies all previous rows on every iteration.
    keyword_frames = []
    for keyword in top_50_keywords[f'topic_{topic}']:
        # Literal substring test (regex=False), vectorised; missing text counts as no match.
        has_keyword = topic_df['text'].str.contains(keyword, regex=False, na=False)
        keyword_df = topic_df[has_keyword].value_counts(['date']).reset_index(name='count')
        # Left join on the full date list so dates without a match are present (count 0).
        keyword_df = unique_date_df.join(keyword_df.set_index('date'), on='date')
        keyword_df = keyword_df[['date', 'count']].fillna(0)
        keyword_df['keyword'] = keyword
        keyword_frames.append(keyword_df[['date', 'keyword', 'count']])

    topic_keywords = pd.concat(keyword_frames, axis=0)
    topic_keywords.to_parquet(f"{day_keyword_count_path}/topic{topic}.parquet", index=False)

### Date, bi-gram, count

In [None]:
new_topics_predictions_df = pd.read_parquet(f"{probs_saving_path}/cleaned_probs_8_200_0.2.parquet")
cleaned_world_anti_tweets = pd.read_parquet(f"{raw_data_path}/cleaned_world_anti_tweets_and_ids.parquet")

# Tweet id next to its most probable topic (1-based); the two inputs are paired row by row.
ids_and_topics = pd.DataFrame({
    'id': cleaned_world_anti_tweets['id'].values,
    'topic': new_topics_predictions_df.values.argmax(axis=1) + 1,
})

# Only the id/topic table is needed from here on.
del new_topics_predictions_df, cleaned_world_anti_tweets

In [None]:
# Read the daily CSVs through the project reader, requesting the id, created_at and text columns.
# NOTE(review): what `filter_tweets=True` filters is defined in src.scripts.read_data and
# is not visible here — TODO confirm.
read_world_data = ReadData(world_data_path, ['id', 'created_at', 'text'], filter_tweets=True)

# presumably fills `read_world_data.data` with the combined frame (the next cell reads
# that attribute) — confirm against the ReadData implementation.
read_world_data.read_csvs_and_combine_data()

  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
  return list(map(*args))
100%|██████████| 73/73 [01:35<00:00,  1.30s/it]


In [None]:
# Keep only the tweets that have a topic assignment, then renumber the rows from 0.
wanted_ids = set(ids_and_topics['id'].values.tolist())
has_topic = read_world_data.data['id'].isin(wanted_ids)
read_world_data.data = read_world_data.data[has_topic].reset_index(drop=True)

In [None]:
# Load the table that combines id, date, text and topic per tweet (columns 'text',
# 'topic' and 'created_at' are used by the cells below).
# NOTE(review): no cell of this notebook writes id_date_text_topic.parquet, so a fresh
# Restart & Run All depends on a file produced elsewhere; presumably it was built from
# `read_world_data.data` and `ids_and_topics` in a cell that no longer exists — TODO restore.
id_date_text_topic = pd.read_parquet(f'{dataframes_path}/id_date_text_topic.parquet')

In [None]:
# Apostrophes stay inside tokens, so contractions such as "don't" remain one token and
# can match the corresponding stop-word entries.
TOKENIZER = RegexpTokenizer(r"[\w']+")
stop_words = set(stopwords.words('english'))

def word_filter(text):
    """Return ``text`` without one-character tokens and English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The kept tokens, in their original order and spelling, joined by single spaces.
    """
    kept_words = [
        word
        for word in TOKENIZER.tokenize(text)
        # NLTK's English stop-word list is lower-case, so compare case-insensitively;
        # otherwise capitalised stop words ("The", "And", ...) slip through.
        if len(word) > 1 and word.lower() not in stop_words
    ]
    return ' '.join(kept_words)

In [None]:
# Work on an independent copy so the loaded table stays untouched.
filtered_id_date_text_topic = id_date_text_topic.copy(deep=True)

# Stop-word filtered version of every tweet text, in row order.
filtered_text = [word_filter(text) for text in tqdm(filtered_id_date_text_topic['text'].values)]

100%|██████████| 6437352/6437352 [01:38<00:00, 65061.66it/s]


In [None]:
# Swap the raw 'text' column for the filtered one; the new column is appended last and
# is matched to the rows through the default 0..n-1 index.
filtered_id_date_text_topic = pd.concat(
    [
        filtered_id_date_text_topic.drop(columns=['text']),
        pd.DataFrame({'text': filtered_text}),
    ],
    axis=1,
)
del filtered_text

In [None]:
from nltk.stem import WordNetLemmatizer

# Treebank POS tag -> one-letter part-of-speech code passed to the lemmatizer
# ('n' noun, 'v' verb, 'a' adjective, 'r' adverb). Pronoun tags are mapped to 'n'.
pos_encoding = {
    **dict.fromkeys(('NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$'), 'n'),
    **dict.fromkeys(('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'v'),
    **dict.fromkeys(('JJ', 'JJR', 'JJS'), 'a'),
    **dict.fromkeys(('RB', 'RBR', 'RBS'), 'r'),
}

# Single lemmatizer instance reused for every token in the lemmatisation loop below.
lemmatizer = WordNetLemmatizer()

In [None]:
from nltk import pos_tag_sents

# POS-tag every tweet: each text is split on whitespace and the token lists are tagged in
# one call. The result holds, per tweet, a list of (token, tag) pairs.
tagged_text = pos_tag_sents([text.split() for text in filtered_id_date_text_topic['text'].values])

In [None]:
# Lemmatise every tagged tweet and rebuild it as a space-joined string.
filtered_text = []
for tagged_sentence in tqdm(tagged_text):
    lemmas = []
    for token, tag in tagged_sentence:
        wordnet_pos = pos_encoding.get(tag, False)
        if wordnet_pos:
            # Tag has a known part of speech: lemmatise with it.
            lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
        else:
            # Unmapped tag: fall back to the lemmatizer's default part of speech.
            lemmas.append(lemmatizer.lemmatize(token))
    filtered_text.append(' '.join(lemmas))

100%|██████████| 6437352/6437352 [08:35<00:00, 12478.67it/s]


In [None]:
# Swap the stop-word filtered 'text' column for its lemmatised version; the new column is
# appended last and matched to the rows through the default 0..n-1 index.
filtered_id_date_text_topic = pd.concat(
    [
        filtered_id_date_text_topic.drop(columns=['text']),
        pd.DataFrame({'text': filtered_text}),
    ],
    axis=1,
)

In [None]:
# Reduce the timestamp to its first 10 characters after converting it to text.
created_at_text = filtered_id_date_text_topic['created_at'].astype('str')
filtered_id_date_text_topic['created_at'] = created_at_text.str[:10]

In [None]:
# Topic number -> words that must all occur in a tweet of that topic for it to be counted.
# Most entries are word pairs; topic 5 uses four words.
collocations = {
    2: ['blood', 'clot'],
    4: ['flu', 'shot'],
    5: ['long', 'term', 'side', 'effect'],
    6: ['big', 'pharma'],
    7: ['immune', 'system'],
}

In [None]:
# collocation string -> {date: number of tweets of the collocation's topic on that date
# that contain every word of the collocation}. Dates of the topic without a match get 0.
bigram_counter = {}

for topic, collocation in tqdm(collocations.items()):
    collocation_str = ' '.join(collocation)
    per_date_counts = {}
    bigram_counter[collocation_str] = per_date_counts

    topic_rows = filtered_id_date_text_topic[filtered_id_date_text_topic['topic'] == topic]
    # Iterate over the two needed columns directly; `iterrows` builds a Series per row,
    # which is very slow on millions of tweets and yields the same values.
    for created_at, text in zip(topic_rows['created_at'], topic_rows['text']):
        per_date_counts.setdefault(created_at, 0)
        tokens = set(text.lower().split())

        # NOTE(review): the words only have to co-occur in the tweet, they need not be
        # adjacent or in order — confirm that this is the intended "collocation" test.
        if all(word in tokens for word in collocation):
            per_date_counts[created_at] += 1

100%|██████████| 5/5 [02:44<00:00, 32.96s/it]


In [None]:
unique_dates = filtered_id_date_text_topic['created_at'].unique()
keywords = list(bigram_counter.keys())

# Wide table: one row per date, one count column per collocation (0 where the collocation
# has no entry for that date). The columns are generated from `keywords`, so the cell
# keeps working when collocations are added or removed instead of being hard-wired to
# exactly five of them.
date_and_bigram_collocations_df = pd.DataFrame({
    'date': unique_dates,
    **{
        keyword: [bigram_counter[keyword].get(date, 0) for date in unique_dates]
        for keyword in keywords
    },
})

In [None]:
# Persist the per-date collocation counts (row labels are not written).
date_and_bigram_collocations_df.to_parquet(f"{dataframes_path}/date_and_bigram_collocations_df.parquet", index=False)