In [0]:
import pandas as pd
import re
import json
import os
import sys
import numpy as np
import pickle

# Load the project configuration; it records where the shared helper module lives.
with open('project_config.json', 'r') as fp:
    project_config = json.load(fp)

# Make the helper module importable. The membership check keeps re-runs of this
# cell from appending the same entry to sys.path over and over.
module_path = os.path.join(project_config['project_module_relative_path'])
if module_path not in sys.path:
    sys.path.append(module_path)

# NOTE(review): the star import hides which helpers this notebook depends on.
# Later cells call string_to_list_column and map_themes, presumably provided
# by data_processing -- confirm and switch to explicit imports.
from data_processing import *

### Read in files

In [0]:
# Parse the evidence export line by line: each non-blank line is decoded as
# its own JSON document. Blank lines are skipped so they cannot make
# json.loads raise, and the encoding is pinned instead of relying on the
# platform default.
with open('./evidence-10-02-23-split.json', encoding='utf-8') as f:
    json_data = [json.loads(line) for line in f if line.strip()]

In [0]:
# Convert the parsed JSON lines to a dataframe (one row per line).
# The next cell reads the "_id" and "_source" columns from it.
df = pd.DataFrame(json_data)

In [0]:
# Flatten each evidence document into one record holding its id, status,
# English text, themeIds and issueIds.
evidence_list = []
for evidence_id, source in zip(df["_id"], df["_source"]):
    # Fall back to an empty dict when textTranslated is absent or null,
    # so the "en" lookup below cannot fail on None.
    text_translated = source.get("textTranslated") or {}
    evidence_list.append({
        "id": evidence_id,
        "status": source.get("status", ""),
        "text": text_translated.get("en", ""),
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        # A missing themeIds stays NaN so the later fillna can take the audit value.
        "themeIds": source.get("themeIds", np.nan),
        "issueIds": source.get("issueIds", ""),
    })

In [0]:
# Convert the flattened records into a dataframe (one row per record,
# with the columns id, status, text, themeIds and issueIds)
evidence=pd.DataFrame(evidence_list)

### Read in audit data

In [0]:
# Read the JSON audit export. The context manager closes the file handle,
# which the bare open() call previously left open.
with open('amp_audit_27oct.json') as f:
    audit_json = json.load(f)

# Name the key column "id" so it matches the evidence frame's key.
# rename() returns a new frame, so re-running this cell gives the same result.
audit = pd.DataFrame(audit_json).rename(columns={"_id": "id"})

In [0]:
# Read in the second audit export (CSV)
audit2 = pd.read_csv("audit_10042023.csv")

In [0]:
# Clean the audit columns that hold theme ids.
# string_to_list_column comes from outside this notebook; presumably it parses
# stringified lists into Python lists -- TODO confirm.
cols_to_clean = [
    "themeIdsReviewed",
    "themeIdsSystem",
    "themeIdsSystemFalseNegatives",
    "themeIdsSystemFalsePositives",
]

for col_name in cols_to_clean:
    audit2 = string_to_list_column(audit2, col_name)

# Align audit2 with the JSON audit schema: the system themes become "themeIds",
# and themeConfidence is added as an empty-string placeholder.
audit2 = audit2.rename(columns={"themeIdsSystem": "themeIds"})
audit2["themeConfidence"] = ""

In [0]:
# Columns selected from both audit frames, in the order used for the combined frame
audit_columns = [
    'id',
    'textTranslated.en',
    'themeIdsReviewed',
    'rawPredictions',
    'themeIds',
    'themeConfidence',
    'themeIdsSystemFalseNegatives',
    'themeIdsSystemFalsePositives',
]

In [0]:
# Stack both audit sources, restricted to the shared columns.
# Row labels are carried over unchanged (no ignore_index), so labels may repeat.
audit_all = pd.concat([frame[audit_columns] for frame in (audit, audit2)])

In [0]:
# Inspect the column dtypes of the combined audit frame
audit_all.dtypes

In [0]:
# Audit rows with no themeIds; reset_index() keeps the previous row label in an "index" column
na= audit_all[audit_all["themeIds"].isna()].reset_index()

In [0]:
# Display the audit rows that are missing themeIds
na

In [0]:
# List the evidence columns before merging with the audit data
evidence.columns

In [0]:
# Outer-join evidence with the audit data on "id" so rows present in only one source are kept.
# Both frames carry a themeIds column, so the result holds themeIds_x (evidence)
# and themeIds_y (audit).
amp = evidence.merge(audit_all, how="outer", on="id")

In [0]:
# Prefer the evidence themeIds; use the audit themeIds where the evidence value is missing
evidence_theme_ids = amp['themeIds_x']
audit_theme_ids = amp['themeIds_y']
amp["themeIds"] = evidence_theme_ids.fillna(audit_theme_ids)

In [0]:
# Row and column counts of the merged frame
amp.shape

In [0]:
# Preview the first rows of the merged frame
amp.head()

# Create a label cutoff

In [0]:
def _without_rfi(themes):
    """Return the list with every 'rfi' entry removed; non-list values pass through unchanged."""
    if not isinstance(themes, list):
        return themes
    return [theme for theme in themes if theme != 'rfi']


def _first_two(themes):
    """Return the first two entries of a list; non-list values pass through unchanged."""
    if not isinstance(themes, list):
        return themes
    return themes[:2]


# Strip 'rfi' from the system and reviewed themes, then cap the system themes at two labels
amp["themeIds"] = amp['themeIds'].apply(_without_rfi)
amp["themeIdsReviewed"] = amp['themeIdsReviewed'].apply(_without_rfi)
amp["themeIds2"] = amp["themeIds"].apply(_first_two)


# Get labeled split

In [0]:
# Rows without reviewed themes get "" so that len() works on every value
amp["themeIdsReviewed"] = amp["themeIdsReviewed"].fillna("")

# A row counts as "labeled" when it has at least one reviewed theme
mask = amp["themeIdsReviewed"].map(len).gt(0)
amp["split"] = "unlabeled"
amp.loc[mask, "split"] = "labeled"

In [0]:
# Number of rows in each split (labeled vs. unlabeled)
amp["split"].value_counts()

# Get parent themes

In [0]:
# Apply map_themes to each theme column and store the result in a matching *Parent column.
# map_themes is defined outside this notebook; presumably it maps theme ids to
# their parent themes -- TODO confirm.
parent_theme_columns = {
    'themeIdsReviewedParent': 'themeIdsReviewed',
    'themeIdsParent': 'themeIds',
    'themeIdsParent2': 'themeIds2',
}
for parent_column, theme_column in parent_theme_columns.items():
    amp[parent_column] = amp[theme_column].apply(map_themes)

In [0]:
# Count how often each value occurs once the reviewed parent lists are exploded into rows
amp["themeIdsReviewedParent"].explode().value_counts()

In [0]:
# Write the merged frame and the combined audit frame to pickle files
amp.to_pickle("./model_training_data_vax.pkl")  
audit_all.to_pickle("./audit_model_training_data_vax.pkl")  

## Quick EDA

In [0]:
# Display the reviewed parent-theme column
amp["themeIdsReviewedParent"]

In [0]:
def _has_conspiracy_corruption(parent_themes):
    """True when parent_themes is not None and contains 'conspiracy-corruption'."""
    return parent_themes is not None and 'conspiracy-corruption' in parent_themes


# Rows whose reviewed parent themes include 'conspiracy-corruption'
cc = amp[amp['themeIdsReviewedParent'].apply(_has_conspiracy_corruption)]

In [0]:
# Drop rows with a missing "text" value. Rebinding cc to the returned frame
# (instead of dropna(inplace=True) on a filtered selection of amp) avoids
# pandas' SettingWithCopyWarning.
# NOTE(review): the vectoriser below reads "textTranslated.en", not "text" --
# confirm that "text" is the column that should be checked here.
cc = cc.dropna(subset=["text"])

In [0]:
# Display the English text of the conspiracy-corruption rows
cc["textTranslated.en"]

In [0]:
#Vectorization libraries 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [0]:
# From the EDA: frequent words that carry little signal and should be ignored.
# Every entry is followed by a comma: the earlier list was missing two, and
# Python's implicit string concatenation had fused them into
# "motherbecause" and "donhis".
common_words = [
    "feel", "like", "just", "im", "know", "need", "tl",
    "dr", "tldr", "amp", "nbsp", "really", "mom", "mum",
    "parent", "nparent", "nmom", "dad", "father",
    "brother", "ex", "nex", "sister", "ns", "mother",
    "because", "wife", "girlfriend", "husband", "boyfriend",
    "gf", "bf", "ive", "ve", "don",
    "his", "she", "shes", "hes", "https", "www", "com",
    "spotify", "youtube", "user", "covid", "19", "rt",
]

# Calendar and time vocabulary: generic terms, weekdays, months and their abbreviations.
# "december" was misspelled as "decemebr", and the abbreviation run repeated
# "april" where "apr" was intended.
time_words = [
    "time", "today", "tomorrow", "yesterday", "morning",
    "afternoon", "night", "day", "week", "weekend", "month",
    "year", "days", "weeks", "weekends", "months", "years",
    "monday", "tuesday", "wednesday", "thursday", "friday",
    "saturday", "sunday", "mon", "tues", "wed", "thurs", "fri",
    "sat", "sun", "january", "february", "march", "april", "may",
    "june", "july", "august", "september", "october", "november",
    "december", "jan", "feb", "mar", "apr", "may", "jun", "jul",
    "aug", "sep", "oct", "nov", "dec",
]

# Add both lists together
additional_stop_words = common_words + time_words

# Merge scikit-learn's built-in English stop words with the custom additions
english_stop_words = text.ENGLISH_STOP_WORDS
combined_stop_words = english_stop_words.union(additional_stop_words)

# Materialise the combined set as a list for the vectoriser's stop_words argument
stop_words_list = [*combined_stop_words]


In [0]:
# Unigram count vectoriser with the combined stop words removed.
# min_df is a float, so it is read as a proportion of documents;
# max_df=1.0 applies no upper cut-off.
cvec = CountVectorizer(
    stop_words=stop_words_list,
    ngram_range=(1, 1),
    min_df=0.0001,
    max_df=1.0,
)

In [0]:
# Fit the vectoriser on the conspiracy-corruption texts and build their document-term count matrix
term_mat = cvec.fit_transform(cc["textTranslated.en"])

In [0]:
# Dense document-term frame: one row per document, one column per vocabulary term.
# Column names come from cvec's current fit, so this cell must run before cvec is refitted.
term_df = pd.DataFrame(term_mat.toarray(), columns=cvec.get_feature_names_out())


In [0]:
# Inspect the vocabulary terms kept by the vectoriser
term_df.columns

In [0]:
# The 30 terms with the highest mean count per document (a Series indexed by term).
# Series.sort_values accepts keyword arguments only in pandas 2.x, so the positional
# axis argument is dropped; .T was a no-op on a Series and is removed as well.
top_terms = term_df.mean().sort_values(ascending=False).head(30)

In [0]:
# Display the top terms for the conspiracy-corruption rows
top_terms

In [0]:
def _has_illness_cause(parent_themes):
    """True when parent_themes is not None and contains 'illness-cause'."""
    return parent_themes is not None and 'illness-cause' in parent_themes


# Rows whose reviewed parent themes include 'illness-cause'
ill = amp[amp['themeIdsReviewedParent'].apply(_has_illness_cause)]

In [0]:
# Refit the same vectoriser on the illness-cause texts.
# This replaces cvec's vocabulary, so cvec.get_feature_names_out() no longer
# matches term_mat from the earlier cell.
term_mat2 = cvec.fit_transform(ill["textTranslated.en"])

In [0]:
# Dense document-term frame for the illness-cause rows (one column per vocabulary term)
term_df2 = pd.DataFrame(term_mat2.toarray(), columns=cvec.get_feature_names_out())


In [0]:
# The 30 terms with the highest mean count per document for the illness-cause rows.
# The sort uses keyword arguments only (required by pandas 2.x); the no-op .T is dropped.
term_df2.mean().sort_values(ascending=False).head(30)

In [0]:
# Display the illness-cause rows
ill