In [1]:
import pandas as pd
from ast import literal_eval
from typing import List, Union, Dict
from collections import defaultdict
from IPython.core.display import HTML
from IPython.display import display
import re
import operator
from copy import copy
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [2]:
def flatten(t: List[List]) -> List:
    """Concatenate the elements of every sub-list of ``t`` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

def custom_eval(x):
    """Turn one raw cell value into a Python object.

    Values whose string form is 'nan' or '[None]' become an empty dict,
    lists and dicts are returned unchanged, and anything else is parsed
    with ``ast.literal_eval``.
    """
    if str(x) in ('nan', '[None]'):
        return {}
    if type(x) in (list, dict):
        # already a Python container: nothing to parse
        return x
    return literal_eval(x)

def item2list(item):
    """Return ``item`` as a list without duplicates.

    A non-list value is wrapped in a one-element list. A list is returned
    with duplicates removed, keeping the first occurrence of each element in
    its original position so the result is the same on every run (a plain
    ``set`` would give an arbitrary order).

    Lists holding unhashable elements (e.g. nested lists) are supported too;
    they are de-duplicated by equality instead of by hash.
    """
    if type(item) is not list:
        return [item]
    try:
        # dict keys keep insertion order, so this is an ordered de-duplication
        return list(dict.fromkeys(item))
    except TypeError:
        # unhashable elements: fall back to an equality-based scan
        unique = []
        for element in item:
            if element not in unique:
                unique.append(element)
        return unique

In [3]:
# Gzipped CSV with one row per entry (columns are listed in the next cell).
df_name = 'all_hum_data_21-12-2022.csv.gz'
data = pd.read_csv(
    df_name,
    compression='gzip',
    lineterminator='\n',
    low_memory=False,
)

# Lookup table: geo location id -> title.
geo_location_df = pd.read_csv('tables/geo_locations.csv', usecols=['id', 'title'])
geo_locations_dict = {
    loc_id: loc_title
    for loc_id, loc_title in zip(
        geo_location_df['id'].tolist(), geo_location_df['title'].tolist()
    )
}

In [4]:
# Show the column names of the loaded entries table.
data.columns

Index(['entry_id', 'created_at', 'excerpt', 'analysis_framework_id', 'lead_id',
       'project_id', 'title', 'outputs'],
      dtype='object')

In [5]:
# Parse the serialized 'outputs' column in place (progress bar comes from tqdm.pandas()).
data['outputs'] = data['outputs'].progress_apply(custom_eval)

# 'all': for every entry, the values of its parsed outputs, each one as a de-duplicated list.
data['all'] = data['outputs'].apply(
    lambda parsed: list(map(item2list, parsed.values()))
)

100%|██████████| 337231/337231 [00:36<00:00, 9206.17it/s] 


In [6]:
###### final mapping sheet
# Columns kept from the mapping sheet, in display order.
ordered_columns = [
    'Original first level',
    'Original second level',
    'Original third level',
    'NLP Type',
    'NLP first level',
    'NLP second level',
    'NLP third level',
    'NLP fourth level',
]

hum_mapping_sheet = (
    pd.read_csv("tables/mapping_sheet_second_version_tmp.csv")
    .drop(columns=["Reversible", "Remarks"])[ordered_columns]
    .reset_index(drop=True)
)
# Second name bound to the very same DataFrame object (not a copy).
mapping_sheet_primary_tags = hum_mapping_sheet
# Independent snapshot taken before the sheet is normalised further down.
original_sheet = hum_mapping_sheet.copy()

In [7]:
# Columns holding the original tag hierarchy of the mapping sheet.
original_levels_cols = [
    "Original first level",
    "Original second level",
    "Original third level",
]

# Columns holding the mapped (NLP) hierarchy that gets normalised here.
mapped_levels_cols = [
    "NLP first level",
    "NLP second level",
    "NLP third level",
]

# Original labels: lower-case, remove tabs and bullet characters, trim.
# Non-string cells (missing values) are left untouched.
for one_level in original_levels_cols:
    hum_mapping_sheet[one_level] = hum_mapping_sheet[one_level].apply(
        lambda x: x.lower().replace("\t", "").replace("•", "").strip()
        if type(x) is str
        else x
    )

# Mapped labels: trim FIRST, then capitalise. str.capitalize() only upper-cases
# the very first character, so capitalising a value that still has leading
# whitespace would leave the label entirely lower-case (" health" -> "health")
# and keep it from collapsing with its properly capitalised duplicate.
for one_level in mapped_levels_cols:
    hum_mapping_sheet[one_level] = hum_mapping_sheet[one_level].apply(
        lambda x: x.strip().capitalize() if type(x) is str else x
    )

# Done in place on purpose: `mapping_sheet_primary_tags` is bound to this same
# object, so re-binding the name here would leave the two out of sync.
hum_mapping_sheet.drop_duplicates(inplace=True)

hum_mapping_sheet.shape

(3128, 8)

In [8]:
# Appears to list the raw "NLP Type" values of the mapping sheet ("nan" standing
# for a missing type) -- TODO confirm. Not referenced by the cells visible here.
NLP_TYPES = [
    "2D",
    "nan",
    "1D",
    "Sector",
    "DEMOGRAPHIC GROUPS",
    "SPECIFIC NEEDS GROUPS",
    "AFFECTED GROUPS",
    "SEVERITY",
    "RELIABILITY",
]

#TODO: review this and add demographic groups new level

def get_final_nlp_name(row: pd.Series):
    """Build the list of NLP tag names described by one mapping-sheet row.

    The row is read through the columns "NLP Type" and "NLP first level" to
    "NLP fourth level". The normalised type (lower-cased, bullets and tabs
    removed, trimmed, spaces turned into underscores) selects the tag layout.

    Returns:
        A list of "->"-separated tag names; empty when the type or the first
        level is missing, or when an affected-groups row has no usable levels.

    Raises:
        Exception: for a type none of the branches below handles (the
            normalised type is printed first).
    """

    def has_value(column):
        # a missing cell is NaN, whose string form is "nan"
        return str(row[column]) != "nan"

    def cap(column):
        return row[column].capitalize()

    # nothing to map without both a type and a first level
    if not (has_value("NLP Type") and has_value("NLP first level")):
        return []

    nlp_type = (
        row["NLP Type"]
        .lower()
        .replace("•", "")
        .replace("\t", "")
        .strip()
        .replace(" ", "_")
    )

    ### affected groups
    if nlp_type == "affected_groups":
        tags = []
        if row["NLP first level"] == "Affected" and has_value("NLP second level"):
            group = cap("NLP second level")
            tags.append(f"first_level_tags->Affected->{group}")
            # third and fourth levels both hang below the second-level group
            for deeper_level in ("NLP third level", "NLP fourth level"):
                if has_value(deeper_level):
                    tags.append(f"secondary_tags->{group}->{cap(deeper_level)}")
        return tags

    ### severity and reliability and specific needs groups
    if nlp_type in ("reliability", "severity", "specific_needs_groups"):
        return [f"secondary_tags->{nlp_type}->{cap('NLP first level')}"]

    ### demographic groups
    if nlp_type == "demographic_groups":
        # the first level is known to be present thanks to the guard above
        tags = [f"secondary_tags->Gender->{cap('NLP first level')}"]
        for other_level in ("NLP second level", "NLP third level", "NLP fourth level"):
            if has_value(other_level):
                tags.append(f"secondary_tags->All->{cap(other_level)}")
        return tags

    ### sectors and subsectors
    if nlp_type == "sector":
        sector = cap("NLP first level")
        tags = [f"first_level_tags->sectors->{sector}"]
        if has_value("NLP second level"):
            tags.append(f"subsectors->{sector}->{cap('NLP second level')}")
        return tags

    ### 1d and 2d subpillars
    if nlp_type in ("1d", "2d"):
        pillar = cap("NLP first level")
        pillar_tag = f"first_level_tags->pillars_{nlp_type}->{pillar}"
        # sometimes only pillar
        if not has_value("NLP second level"):
            return [pillar_tag]
        return [
            f"subpillars_{nlp_type}->{pillar}->{cap('NLP second level')}",
            pillar_tag,
        ]

    print(nlp_type)
    raise (Exception("problem!"))

In [9]:
# One list of NLP tag names per mapping-sheet row.
hum_mapping_sheet['mapped_nlp'] = hum_mapping_sheet.apply(get_final_nlp_name, axis=1)

# Every distinct NLP tag the mapping sheet can produce, in alphabetical order.
nlp_tags_mapping_sheet = sorted(set(flatten(hum_mapping_sheet['mapped_nlp'])))
len(nlp_tags_mapping_sheet)

179

In [10]:
# Keys of an entry's parsed outputs whose items are looked up in the mapping
# sheet by the conversion loop further down.
mapping_widgets = [
    "matrix2dWidget",
    "multiselectWidget",
    "no_common_matrix2dWidget",
    "organigramWidget",
    "selectWidget",
    "raw",
    "scaleWidget",
    "no_common_multiselectWidget",
    "matrix1dWidget",
]

# Mapping-sheet columns holding the original tag hierarchy.
original_levels_cols = ['Original first level', 'Original second level', 'Original third level']
# Two digits, dash, two digits, dash, four digits. The loop below uses .match(),
# so the pattern is only anchored at the start of the string.
date_regex = re.compile(r"\d\d-\d\d-\d\d\d\d")

In [11]:
def _find_mapping_rows(all_items):
    """Select the rows of ``hum_mapping_sheet`` matching a split tag path.

    ``all_items`` is a raw tag split on "->". A single component is compared
    with every column of ``original_levels_cols``; for longer paths the last
    two components must equal "Original first level" and
    "Original second level" respectively.
    """
    last_item = all_items[-1]
    if len(all_items) == 1:
        # secondary tags or isolated items
        mask = hum_mapping_sheet[original_levels_cols].eq(last_item).any(axis=1)
    else:
        # subpillars, subsectors
        mask = hum_mapping_sheet["Original first level"].eq(
            all_items[-2]
        ) & hum_mapping_sheet["Original second level"].eq(last_item)
    return hum_mapping_sheet[mask]


def _resolve_mapped_tags(item):
    """Resolve one raw tag string through the mapping sheet.

    Returns the ``mapped_nlp`` list of the matching row, or one of the
    sentinel strings "no_mapping" (no row matches) and "too_many_rows"
    (several rows match and disagree, with no unique first-level-only row).
    """
    all_items = item.strip().split("->")
    mapping_row = _find_mapping_rows(all_items)

    if len(mapping_row) == 0:
        return "no_mapping"

    # a single row, or several rows that all map to the same tags
    if len(set(mapping_row["mapped_nlp"].apply(str))) == 1:
        return mapping_row.iloc[0]["mapped_nlp"]

    # ambiguous: fall back to the row where the tag is a first level on its
    # own, i.e. whose second level is missing
    first_level_mapped_row = hum_mapping_sheet[
        hum_mapping_sheet["Original first level"].eq(all_items[-1])
        & hum_mapping_sheet["Original second level"].apply(str).eq("nan")
    ]
    if len(first_level_mapped_row) == 1:
        return first_level_mapped_row.iloc[0]["mapped_nlp"]

    return "too_many_rows"


# cache: raw tag -> list of NLP tags, or "no_mapping" / "too_many_rows"
mapping_dict = defaultdict(list)

# raw tags that could not be resolved, kept for inspection
too_many_rows, no_mapping = set(), set()

# one dict per entry with the keys "nlp_tags", "geo_location", "excerpt_date"
nlp_all_outputs = []

for one_output in tqdm(data["outputs"].tolist()):

    nlp_one_output = defaultdict(list)

    ####### dates
    dates_tmp = []
    for one_date_widget_type in ["dateRangeWidget", "dateWidget"]:
        if one_date_widget_type in one_output:
            dates_tmp.extend(
                one_date
                for one_date in one_output[one_date_widget_type]
                if one_date is not None
            )

    ##### geo_locations
    # NOTE(review): ids are looked up as-is here, while digit-only items of the
    # mapping widgets below are converted with int() first -- confirm that the
    # geoWidget ids have the same type as the keys of geo_locations_dict.
    if "geoWidget" in one_output:
        geo_location_output = [
            geo_locations_dict.get(one_loc_id) for one_loc_id in one_output["geoWidget"]
        ]
    else:
        geo_location_output = []

    ##### nlp mapping widgets
    for one_widget_type in mapping_widgets:
        if one_widget_type not in one_output:
            continue

        for item in one_output[one_widget_type]:
            # Skip placeholders. Non-string values (None, NaN, ...) are skipped
            # as well: they cannot be looked up, and calling str methods on
            # them would raise AttributeError.
            if not isinstance(item, str) or item in ["nan", "none", "", "n/a"]:
                continue

            if item.isdigit():
                # digit-only items are geo location ids
                geo_location_output.append(geo_locations_dict.get(int(item)))

            elif date_regex.match(item):
                dates_tmp.append(item)

            else:
                if item not in mapping_dict:
                    # first time this raw tag is seen: resolve it and cache it
                    resolved = _resolve_mapped_tags(item)
                    mapping_dict[item] = resolved
                    if resolved == "no_mapping":
                        no_mapping.add(item)
                    elif resolved == "too_many_rows":
                        too_many_rows.add(item)

                mapped = mapping_dict[item]
                if mapped not in ["no_mapping", "too_many_rows"]:
                    nlp_one_output["nlp_tags"].extend(mapped)

    nlp_one_output["geo_location"] = [
        one_loc for one_loc in geo_location_output if one_loc is not None
    ]

    # first date found, or "-" when the entry has none
    nlp_one_output["excerpt_date"] = dates_tmp[0] if dates_tmp else "-"

    # unique tags, sorted so the result is identical from one run to the next
    nlp_one_output["nlp_tags"] = sorted(set(nlp_one_output["nlp_tags"]))

    nlp_all_outputs.append(nlp_one_output)

100%|██████████| 337231/337231 [02:33<00:00, 2193.96it/s] 


In [14]:
# Copy every per-entry result into a column of the same name.
for _result_key in ('excerpt_date', 'geo_location', 'nlp_tags'):
    data[_result_key] = [
        one_excerpt_tags[_result_key] for one_excerpt_tags in nlp_all_outputs
    ]

In [27]:
# too short excerpts -> noise: keep only excerpts longer than 25 characters.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice (SettingWithCopyWarning).
data = data[data.excerpt.apply(lambda x: len(str(x))) > 25].copy()

# too many level 2 tags -> noise
data['second_level_tags'] = data['nlp_tags'].apply(
    lambda x: [
        item for item in x if any(kw in item for kw in ['subsectors', 'subpillars'])
    ]
)
# Keep entries with AT MOST 5 second-level tags. The previous condition
# (len(x) > 5) did the opposite: it kept only the noisy entries.
data = data[data['second_level_tags'].apply(len) <= 5]

In [25]:
# Number of entries per NLP tag, most frequent first
# (tags with equal counts stay in first-seen order).
all_tags = dict(Counter(flatten(data.nlp_tags)).most_common())
len(all_tags)

179

In [None]:
#TODO: drop duplicated excerpts (see the value counts below) and check how the preprocessing and modelling steps handle duplicates

In [26]:
# Occurrences of each distinct excerpt text; a count above 1 means the excerpt is duplicated.
data.excerpt.value_counts()

[General household information]Two most common behaviours adapted to prevent COVID-19 spreading, as reported by households Stopping handshakes or physical contact (48%), Keeping distance from people (42%).                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                26
 Désinfection des ménages des cas confirmés.                                                                                                                                                                                                                                           