In [0]:
import pandas as pd
import re
import json
import os
import sys
import numpy as np

# Load the project configuration and make the project's helper modules importable.
with open('project_config.json', 'r', encoding='utf-8') as fp:
    project_config = json.load(fp)

# Resolve the configured path to an absolute one so later imports keep working even if
# the working directory changes, and register it only once so that re-running this cell
# does not pile up duplicate sys.path entries.
module_path = os.path.abspath(project_config['project_module_relative_path'])
if module_path not in sys.path:
    sys.path.append(module_path)
 
from data_processing import *

### Read in files

In [0]:
# Collect one parsed JSON object per line of the input file
json_data = []

# Read the file line by line. Blank lines (e.g. a trailing newline at the end of
# the file) are skipped because json.loads raises on an empty string.
with open('./evidence-10-02-23-split.json', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            json_data.append(json.loads(line))

In [0]:
# Build a DataFrame with one row per parsed JSON record
df = pd.DataFrame(data=json_data)

In [0]:
# Flatten each record's nested "_source" payload so that id, status, English text,
# themeIds and issueIds each end up in their own column.
evidence_list = []
for _, row in df.iterrows():
    source = row["_source"]
    # "textTranslated" may be absent or explicitly null; use an empty mapping either way
    # so the "en" lookup below cannot fail on None.
    text_translated = source.get("textTranslated") or {}
    evidence_list.append({
        "id": row["_id"],
        "status": source.get("status", ""),
        "text": text_translated.get("en", ""),
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        "themeIds": source.get("themeIds", np.nan),
        "issueIds": source.get("issueIds", ""),
    })

In [0]:
# Assemble the flattened evidence records into a DataFrame
evidence = pd.DataFrame(data=evidence_list)

In [0]:
# Load the audit table from its CSV export
audit = pd.read_csv(filepath_or_buffer="amp_audit.csv")

In [0]:
# Audit columns to pass through the project's string_to_list_column helper
# (presumably it parses stringified lists into real lists — confirm in data_processing)
cols_to_clean = [
    "themeIdsReviewed",
    "themeIdsSystemFalseNegatives",
    "themeIdsSystemFalsePositives",
]

# Apply the helper one column at a time, carrying the returned frame forward
for column in cols_to_clean:
    audit = string_to_list_column(audit, column)

In [0]:
# Left-join the audit columns onto the evidence rows by "id" (every evidence row is kept)
amp = evidence.merge(audit, how="left", on="id")

In [0]:
# Persist the merged table as CSV, leaving out the index column
amp.to_csv(path_or_buf="model_training_data.csv", index=False)

# Old processing

In [0]:
# Load the older evidence export and strip '#', ',', '@' and '&' from its column names.
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the column names untouched.
evidence = pd.read_csv("evidence-09-22-23.csv")
evidence.columns = evidence.columns.str.replace('[#,@,&]', '', regex=True)

# Same column-name clean-up for the themes table, then rename its id/name columns so
# they line up with the names used by the evidence data.
themes = pd.read_csv("themes.csv")
themes.columns = themes.columns.str.replace('[#,@,&]', '', regex=True)
themes = themes.rename(columns={"id": "themeIds", "nameTranslated.en": "themeName"})

In [0]:
# Display the textTranslated column of the older evidence export
evidence.loc[:, 'textTranslated']

#### Clean up the datatype of the evidence columns. List-valued columns are read in as a single string instead of a list

In [0]:
# Work on a copy so the raw `evidence` frame stays untouched and this cell can be
# re-run safely. Note: no rows are dropped here.
df = evidence.copy()

cols_to_clean = ["themeIds"]

# Pass each listed column through the project's string_to_list_column helper
for column in cols_to_clean:
    df = string_to_list_column(df, column)

In [0]:
# Preview: parse each JSON-encoded textTranslated value and pull out its English text.
# Only non-empty strings are parsed; a plain truthiness check is not enough because NaN
# (a missing cell) is truthy and json.loads would raise TypeError on it.
df['textTranslated'].apply(
    lambda x: json.loads(x)[0]['en'] if isinstance(x, str) and x else None
)

In [0]:
# Strip surrounding double quotes from string entries; non-string values pass through unchanged
df["textTranslated"] = df["textTranslated"].apply(
    lambda value: value.strip('"') if isinstance(value, str) else value
)

In [0]:
# Show the textTranslated value stored under row label 0
df["textTranslated"].loc[0]

## Get theme names into dataframe

In [0]:
# Lookup table from theme ID to theme name, built by pairing the two themes columns row by row
theme_dict = {
    theme_id: theme_name
    for theme_id, theme_name in zip(themes['themeIds'], themes['themeName'])
}

In [0]:
# Replace theme IDs with their theme names
def replace_theme_ids(theme_ids, mapping=None):
    """Translate a collection of theme IDs into theme names.

    Args:
        theme_ids: Iterable of theme IDs. ``None`` or a scalar NaN (a missing
            cell) is treated as "no themes".
        mapping: Optional ID -> name mapping. When omitted, the notebook-level
            ``theme_dict`` is used.

    Returns:
        list: Names of the IDs that have an entry in the mapping, in input
        order. IDs without an entry are skipped.
    """
    # NaN is the only float that is not equal to itself; this catches missing cells
    # without needing pandas inside the helper.
    if theme_ids is None or (isinstance(theme_ids, float) and theme_ids != theme_ids):
        return []
    lookup = theme_dict if mapping is None else mapping
    return [lookup[theme_id] for theme_id in theme_ids if theme_id in lookup]


In [0]:
# Overwrite each row's list of theme IDs with the matching theme names: missing (NaN)
# IDs are dropped, and IDs absent from theme_dict become the literal 'default_value'.
df['themeIds'] = df['themeIds'].apply(
    lambda ids: [
        theme_dict.get(theme_id, 'default_value')
        for theme_id in ids
        if not pd.isna(theme_id)
    ]
)

### Clean text column

In [0]:
# Preview the first five rows of the working frame
df.head(n=5)

In [0]:
# Show the textTranslated value stored under row label 0
df["textTranslated"].loc[0]

In [0]:
# Run the project's clean_text helper on the "text" column and keep the returned frame.
# NOTE(review): the other cells in this section use "textTranslated"; confirm that a
# "text" column actually exists in this frame before relying on this step.
df = clean_text(df, "text")

In [0]:
# Write the processed frame to CSV, leaving out the index column
df.to_csv(path_or_buf="amp.csv", index=False)