## Data Ingestion

- Currently using only Word and WordNet Extension columns
- Aggregating & grouping details into json for easy use

In [1]:
import pandas as pd
import json
import numpy as np

In [7]:
df = pd.read_csv('dict4dataset_06_21_22.csv')

In [8]:
df

Unnamed: 0,Strategy No.,Strategy Name,Category No.,Category Name,Semantic Anchor,Word,WordNet Extension,Phrase Extension,Reprogramming,ReprogramType,Popup_title,popup_feedback,Sidebar_feedback,Color(hex)
0,L2d,Understanding Common Symptom Indicators,1.0,Signs of Depression -> Depressed Mood or Dyshp...,Depressed Mood,depress,"depressed, depression, gloomy, grim, blue, dis...",/,,,,This is a common sign that it is time to care ...,"<p>Are you feeling depressed, anxious, stresse...",#d3cd57
1,L2d,Understanding Common Symptom Indicators,1.0,Signs of Depression -> Depressed Mood or Dyshp...,Dyshphoria,dyshphoria,"unhappy, uneasy, dissatisfied, discomfort, dis...",/,,,,This is a common sign that it is time to care ...,"<p>Are you feeling depressed, anxious, stresse...",#d3cd57
2,,,,,,,,,,,,,,
3,L2d,Understanding Common Symptom Indicators,2.0,Signs of Depression -> Hopeless Outlook,Hopeless,hopeless,"desperate, despairing","no help, don't help",,,,This is a common sign that it is time to care ...,"<p>Are you feeling depressed, anxious, stresse...",#d3cd57
4,L2d,Understanding Common Symptom Indicators,2.0,Signs of Depression -> Hopeless Outlook,Helpless,helpless,incapacitated,"nobody help, no one help",,,,This is a common sign that it is time to care ...,"<p>Are you feeling depressed, anxious, stresse...",#d3cd57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,L2a,Postive reframing,,,,I hate + pronoun,,,,,,,,
134,L2a,Postive reframing,,,,I can't,,,"in the past, but in the future",after,,,,
135,,,,,,,,,,,,,,
136,,,,,,,,,,,,,,


In [31]:
df_clean = df.replace({"Word": "/", "WordNet Extension": "/", "Phrase Extension": "/"}, np.nan) \
             .dropna(subset=['Category No.']) \
             .astype({'Category No.': 'int32'}) \
             .sort_values(["Strategy No."])

# df_clean = df.dropna(subset=['Category No.']) \
#              .astype({'Category No.': 'int32'}) \
#              .sort_values(["Strategy No."])

# for wordnet extension split at comma
df_clean['WordNet Extension'] = df_clean['WordNet Extension'].apply(lambda x: x.split(',') if isinstance(x,str) else x)
df_clean['Phrase Extension'] = df_clean['Phrase Extension'].apply(lambda x: x.split(',') if isinstance(x,str) else x)

# for reprogramming split at ;
df_clean['Reprogramming'] = df_clean['Reprogramming'].apply(lambda x: x.split(';') if isinstance(x,str) else x)

df_clean

Unnamed: 0,Strategy No.,Strategy Name,Category No.,Category Name,Semantic Anchor,Word,WordNet Extension,Phrase Extension,Reprogramming,ReprogramType,Popup_title,popup_feedback,Sidebar_feedback,Color(hex)
112,L1b,Core Values,2,Negative Emotion,,"sad, depressed, heartbreak, heartbroken",,,[I have these feelings maybe because I value ....,after,,,,
110,L1b,Core Values,1,Dysfunctional Self-Talk,,,,,,,,,,
99,L1b,Core Values,1,Dysfunctional Self-Talk,,loser,,,[My inner value behind these is],,,,,
101,L1b,Core Values,1,Dysfunctional Self-Talk,,hate,,,,,,,,
102,L1b,Core Values,1,Dysfunctional Self-Talk,,lazy,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,L2f,Understanding Healthy Patterns,2,Coherent narrative words,,therefore,,,,,,,,
91,L2f,Understanding Healthy Patterns,2,Coherent narrative words,,because,,,,,,,,
88,L2f,Understanding Healthy Patterns,1,Insights words,,understand,,,,,,,,
87,L2f,Understanding Healthy Patterns,1,Insights words,,realize,,,,,,,,


In [32]:
df_clean.columns

Index(['Strategy No.', 'Strategy Name', 'Category No.', 'Category Name',
       'Semantic Anchor', 'Word', 'WordNet Extension', 'Phrase Extension',
       'Reprogramming', 'ReprogramType', 'Popup_title', 'popup_feedback',
       'Sidebar_feedback', 'Color(hex)'],
      dtype='object')

In [33]:
groupped = df_clean.groupby(["Strategy No.", "Category No.", "Category Name", "Semantic Anchor"], dropna=False) \
                   .apply(lambda x: x[["Word","WordNet Extension","Phrase Extension", "Reprogramming", "ReprogramType", "Popup_title", "popup_feedback","Sidebar_feedback", "Color(hex)"]].to_json(orient='columns'))

In [34]:
isinstance(groupped, pd.Series)

True

In [35]:
# groupped.groups
# groupped.index
for index, value in groupped.items():
    print(f"Index : {index}, Value : {json.loads(value)}")

Index : ('L1b', 1, 'Dysfunctional Self-Talk', nan), Value : {'Word': {'110': None, '99': 'loser', '101': 'hate', '102': 'lazy', '103': 'the worst', '100': 'suck', '105': 'failure', '106': 'pathetic', '107': 'good-for-nothing', '108': 'dumb', '109': 'stupid', '104': 'useless'}, 'WordNet Extension': {'110': None, '99': None, '101': None, '102': None, '103': None, '100': None, '105': None, '106': None, '107': None, '108': None, '109': None, '104': None}, 'Phrase Extension': {'110': None, '99': None, '101': None, '102': None, '103': None, '100': None, '105': None, '106': None, '107': None, '108': None, '109': None, '104': None}, 'Reprogramming': {'110': None, '99': ['My inner value behind these is'], '101': None, '102': None, '103': None, '100': None, '105': None, '106': None, '107': None, '108': None, '109': None, '104': None}, 'ReprogramType': {'110': None, '99': None, '101': None, '102': None, '103': None, '100': None, '105': None, '106': None, '107': None, '108': None, '109': None, '10

In [36]:
agg = []

for index, value in groupped.items():
    out = { 
        "strategy_code" : index[0],
        "category_number" : index[1],
        "semantic_anchor" : index[2],
        "words" : [i for i in list(json.loads(value)["Word"].values()) if i],
        "wordnet_ext" : [i for i in list(json.loads(value)["WordNet Extension"].values()) if i],
        "phrase_ext" : [i for i in list(json.loads(value)["Phrase Extension"].values()) if i],
        "rewrite" : list(json.loads(value)["Reprogramming"].values())[0],
        "rewrite_position" : list(json.loads(value)["ReprogramType"].values())[0],
        # "brief_feedback" : [*json.loads(value)["Brief Feedback"].values()][0],
        # "longer_feedback" : [*json.loads(value)["Longer Feedback"].values()][0],
        # "color" : [*json.loads(value)["Color"].values()][0],
        "popup_title" : [*json.loads(value)["Popup_title"].values()][0],
        "popup_feedback" : [*json.loads(value)["popup_feedback"].values()][0],
        "Sidebar_feedback" : [*json.loads(value)["Sidebar_feedback"].values()][0],
        "color" : [*json.loads(value)["Color(hex)"].values()][0],
    }
    out["wordnet_ext"] = [item.strip() for sublist in out["wordnet_ext"] for item in sublist]
    out["phrase_ext"] = [item.strip() for sublist in out["phrase_ext"] for item in sublist]
    agg.append(out)
    
# print(agg)
# print(json.loads(agg))

## FOR SAVING:

In [37]:
import datetime
ts = datetime.datetime.now().isoformat()

In [38]:
with open(ts+'dict.json', 'w', encoding='utf-8') as f:
    json.dump(agg, f, ensure_ascii=False, indent=4)