## Data Ingestion

- Currently using only Word and WordNet Extension columns
- Aggregating & grouping details into JSON for easy use

In [62]:
import pandas as pd
import json
import numpy as np

In [63]:
# Load the dictionary CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dict4dataset_05-30.csv')

In [64]:
# Display the loaded frame to inspect its columns and spot-check rows.
df

Unnamed: 0,Strategy Name,Strategy No.,Category No.,Category Name,Semantic Anchor,Word,WordNet Extension,Phrase Extension,Reprogramming,ReprogramType,Source,Brief Feedback,Longer Feedback,Color
0,Understanding Common Symptom Indicators,L2d,1.0,Signs of Depression -> Depressed Mood or Dyshp...,Depressed Mood,depress,/,,,,Wikipedia,Depression is a mental state of low mood and a...,Classified medically as a mental and behaviora...,#140c1c
1,Understanding Common Symptom Indicators,L2d,1.0,Signs of Depression -> Depressed Mood or Dyshp...,Depressed Mood,depressed,"gloomy grim, blue, dispirited, down, downcast,...",,,,,Depression is a mental state of low mood and a...,Classified medically as a mental and behaviora...,#140c1c
2,Understanding Common Symptom Indicators,L2d,1.0,Signs of Depression -> Depressed Mood or Dyshp...,Depressed Mood,depression,/,,,,,Depression is a mental state of low mood and a...,Classified medically as a mental and behaviora...,#140c1c
3,Understanding Common Symptom Indicators,L2d,1.0,Signs of Depression -> Depressed Mood or Dyshp...,Dyshphoria,,,,,,,Depression is a mental state of low mood and a...,Classified medically as a mental and behaviora...,#140c1c
4,Understanding Common Symptom Indicators,L2d,2.0,Signs of Depression -> Hopeless Outlook,hopeless,hopeless,/,no/don't + help,But ...,after,,Hopelessness is an emotion characterized by a ...,Hopelessness is a powerful emotion that often ...,#442434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,Understanding Healthy Patterns,L2f,2.0,Coherent narrative words,,therefore,,,,,,,,#aed65a
79,Understanding Healthy Patterns,L2f,3.0,Constructive Self-Talk,,help,,,,,,,,#44aa38
80,Understanding Healthy Patterns,L2f,3.0,Constructive Self-Talk,,improve,,,,,,,,#44aa38
81,Understanding Healthy Patterns,L2f,3.0,Constructive Self-Talk,,work on,,,,,,,,#44aa38


In [81]:
def _split_cell(cell, sep):
    """Split a delimited string cell into a list of trimmed, non-empty tokens.

    Non-string cells (e.g. NaN for missing values) are returned unchanged.
    Tokens are stripped because the sheet separates items with ", ", so a
    plain ``split(',')`` would leave a leading space on every token but the
    first.
    """
    if not isinstance(cell, str):
        return cell
    return [token.strip() for token in cell.split(sep) if token.strip()]


# "/" is a placeholder for "no value" in the Word / WordNet Extension columns:
# turn it into NaN, drop rows without a category number, store the category
# number as an integer and order the rows by strategy code.
df_clean = (
    df.replace({"Word": "/", "WordNet Extension": "/"}, np.nan)
      .dropna(subset=['Category No.'])
      .astype({'Category No.': 'int32'})
      .sort_values(["Strategy No."])
)

# WordNet / phrase extensions are comma-separated; reprogramming options are
# separated by ";".
for column, sep in (
    ('WordNet Extension', ','),
    ('Phrase Extension', ','),
    ('Reprogramming', ';'),
):
    df_clean[column] = df_clean[column].apply(_split_cell, args=(sep,))

df_clean

Unnamed: 0,Strategy Name,Strategy No.,Category No.,Category Name,Semantic Anchor,Word,WordNet Extension,Phrase Extension,Reprogramming,ReprogramType,Source,Brief Feedback,Longer Feedback,Color
31,Understanding Judgement,L2a,1,Positive Adjectives,,brave,,[xx is adj],[... because],after,,,,#597dce
32,Understanding Judgement,L2a,2,Negative Adjectives,,egoistic,,,,,,,,#d27d2c
35,Understanding Cognitive Distortion,L2b,1,Should Statement,should,should,,,"[can, choose, want to, prefer, would like to, ...",replace,,,,#8595a1
37,Understanding Cognitive Distortion,L2b,2,All or Nothing Thinking / Overgeneralization,absolutive words,never,,,,,,,,#6daa2c
38,Understanding Cognitive Distortion,L2b,2,All or Nothing Thinking / Overgeneralization,absolutive words,always,,,,,,,,#6daa2c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,Understanding Healthy Patterns,L2f,2,Coherent narrative words,,because,,,,,,,,#aed65a
78,Understanding Healthy Patterns,L2f,2,Coherent narrative words,,therefore,,,,,,,,#aed65a
79,Understanding Healthy Patterns,L2f,3,Constructive Self-Talk,,help,,,,,,,,#44aa38
80,Understanding Healthy Patterns,L2f,3,Constructive Self-Talk,,improve,,,,,,,,#44aa38


In [82]:
# List the column labels of the cleaned frame.
df_clean.columns

Index(['Strategy Name', 'Strategy No.', 'Category No.', 'Category Name',
       'Semantic Anchor', 'Word', 'WordNet Extension', 'Phrase Extension',
       'Reprogramming', 'ReprogramType', 'Source', 'Brief Feedback',
       'Longer Feedback', 'Color'],
      dtype='object')

In [83]:
# Columns that identify a group, and the per-row detail columns that are
# serialized for each group.
group_keys = ["Strategy No.", "Category No.", "Category Name", "Semantic Anchor"]
detail_columns = [
    "Word",
    "WordNet Extension",
    "Phrase Extension",
    "Reprogramming",
    "ReprogramType",
    "Brief Feedback",
    "Longer Feedback",
    "Color",
]


def _group_to_json(group_frame):
    """Serialize the detail columns of one group as a column-oriented JSON string."""
    return group_frame[detail_columns].to_json(orient='columns')


# dropna=False keeps groups whose key contains NaN (e.g. rows without a
# semantic anchor).
groupped = df_clean.groupby(group_keys, dropna=False).apply(_group_to_json)

In [84]:
# Check that the grouped result is a Series (one JSON string per group key).
isinstance(groupped, pd.Series)

True

In [85]:
# Print every group key next to its decoded JSON payload for a visual check.
for group_key, payload in groupped.items():
    print(f"Index : {group_key}, Value : {json.loads(payload)}")

Index : ('L2a', 1, 'Positive Adjectives', nan), Value : {'Word': {'31': 'brave'}, 'WordNet Extension': {'31': None}, 'Phrase Extension': {'31': ['xx is adj']}, 'Reprogramming': {'31': ['... because']}, 'ReprogramType': {'31': 'after'}, 'Brief Feedback': {'31': None}, 'Longer Feedback': {'31': None}, 'Color': {'31': '#597dce'}}
Index : ('L2a', 2, 'Negative Adjectives', nan), Value : {'Word': {'32': 'egoistic'}, 'WordNet Extension': {'32': None}, 'Phrase Extension': {'32': None}, 'Reprogramming': {'32': None}, 'ReprogramType': {'32': None}, 'Brief Feedback': {'32': None}, 'Longer Feedback': {'32': None}, 'Color': {'32': '#d27d2c'}}
Index : ('L2b', 1, 'Should Statement', 'should'), Value : {'Word': {'35': 'should'}, 'WordNet Extension': {'35': None}, 'Phrase Extension': {'35': None}, 'Reprogramming': {'35': ['can', 'choose', 'want to', 'prefer', 'would like to', 'plan to']}, 'ReprogramType': {'35': 'replace'}, 'Brief Feedback': {'35': None}, 'Longer Feedback': {'35': None}, 'Color': {'35'

In [86]:
def _first_present(values):
    """Return the first entry of ``values`` that is not None, or None if all are."""
    return next((entry for entry in values if entry is not None), None)


# Flatten each group's JSON payload into one record per group.
agg = []

for index, value in groupped.items():
    # Decode the payload once instead of once per field.
    group = json.loads(value)
    out = {
        "strategy_code": index[0],
        "category_number": index[1],
        # NOTE(review): index[2] is the "Category Name" group key; the
        # "Semantic Anchor" key is index[3]. Kept as-is so the output schema
        # does not change - confirm which one consumers expect.
        "semantic_anchor": index[2],
        "words": [word for word in group["Word"].values() if word],
        # Take the first non-null value in the group rather than the first
        # row's value: within a group only some rows carry the extension or
        # feedback, and the first row may be empty.
        "wordnet_ext": _first_present(group["WordNet Extension"].values()),
        "phrase_ext": _first_present(group["Phrase Extension"].values()),
        "rewrite": _first_present(group["Reprogramming"].values()),
        "rewrite_position": _first_present(group["ReprogramType"].values()),
        "brief_feedback": _first_present(group["Brief Feedback"].values()),
        "longer_feedback": _first_present(group["Longer Feedback"].values()),
        "color": _first_present(group["Color"].values()),
    }
    agg.append(out)

print(agg)

[{'strategy_code': 'L2a', 'category_number': 1, 'semantic_anchor': 'Positive Adjectives', 'words': ['brave'], 'wordnet_ext': None, 'phrase_ext': ['xx is adj'], 'rewrite': ['... because'], 'rewrite_position': 'after', 'brief_feedback': None, 'longer_feedback': None, 'color': '#597dce'}, {'strategy_code': 'L2a', 'category_number': 2, 'semantic_anchor': 'Negative Adjectives', 'words': ['egoistic'], 'wordnet_ext': None, 'phrase_ext': None, 'rewrite': None, 'rewrite_position': None, 'brief_feedback': None, 'longer_feedback': None, 'color': '#d27d2c'}, {'strategy_code': 'L2b', 'category_number': 1, 'semantic_anchor': 'Should Statement', 'words': ['should'], 'wordnet_ext': None, 'phrase_ext': None, 'rewrite': ['can', 'choose', 'want to', 'prefer', 'would like to', 'plan to'], 'rewrite_position': 'replace', 'brief_feedback': None, 'longer_feedback': None, 'color': '#8595a1'}, {'strategy_code': 'L2b', 'category_number': 2, 'semantic_anchor': 'All or Nothing Thinking / Overgeneralization', 'word

## FOR SAVING:

In [78]:
import datetime
# Timestamp used as the output filename prefix. A colon-free format is used
# because isoformat() contains ":" characters, which are not allowed in
# Windows filenames.
ts = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

In [79]:
# Write the aggregated records as indented UTF-8 JSON; the filename is the
# timestamp followed by 'dict.json'.
output_path = ts + 'dict.json'
with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(agg, out_file, ensure_ascii=False, indent=4)