In [7]:
import pandas as pd
import numpy as np
import math
import os

def printCategory(category_dict):
    """Print every category name of a nested category dictionary, depth first.

    category_dict maps a category name either to None (no sub-categories) or
    to another dictionary of the same shape. Each name is printed on its own
    line, and a parent is always printed before its children.
    """
    for name, children in category_dict.items():
        print(name)
        if children is None:
            continue
        # Descend into the nested dictionary of sub-categories.
        printCategory(children)
            
def createLevelColumns(df, levels, new_columns):
    """Fill the 'level1', 'level2' and 'mostSpecificCategory' columns of df in place.

    For every row, the first non-"NULL" column among the keys of `levels`
    supplies the level-1 value. If that key maps to a nested dictionary, the
    first non-"NULL" column among its keys supplies the level-2 value, which
    is stored as "<level2>__<level1>". 'mostSpecificCategory' receives the
    level-2 string when there is one, otherwise the level-1 value, otherwise
    "NULL".

    `new_columns` is accepted but not used here. The three target columns
    must already exist in df. Nothing is returned.
    """
    for row in range(len(df)):
        first_level, remaining = findNotNullLevel(df, row, levels)

        second_level = "NULL"
        if remaining is not None:
            second_level, remaining = findNotNullLevel(df, row, remaining)

        if second_level != "NULL":
            # Qualify the child with its parent so the combined name is unambiguous.
            second_level = second_level + "__" + first_level
            deepest = second_level
        elif first_level != "NULL":
            deepest = first_level
        else:
            deepest = "NULL"

        # Write the three results into this row by position.
        for column, value in (('level1', first_level),
                              ('level2', second_level),
                              ('mostSpecificCategory', deepest)):
            df.iloc[row, df.columns.get_loc(column)] = value
        
def findNotNullLevel(df, i, levels):
    """Return the first non-"NULL" value of row i among the columns named in `levels`.

    `levels` is a dictionary whose keys are column names of df and whose
    values are whatever belongs to the next level down (a nested dictionary
    or None). The columns are tried in the dictionary's iteration order.

    Returns a pair (value, levels[column]) for the first column whose cell in
    row i is not the string "NULL", or ("NULL", None) when every listed
    column is "NULL" or `levels` is empty.
    """
    for column, sub_levels in levels.items():
        cell = df.iloc[i][column]
        if cell != "NULL":
            return cell, sub_levels
    return "NULL", None

def unionTwoLists(list1, list2):
    """Append to list2 every element of list1 it does not already contain.

    list2 is modified in place and is also the return value; list1 is left
    untouched. Elements already in list2 keep their position, new elements
    are appended in the order they occur in list1, and an element repeated in
    list1 is added only once.
    """
    for candidate in list1:
        if candidate in list2:
            continue
        list2.append(candidate)
    return list2

def checkNULL(checked_list):
    """Print whether checked_list contains the string "NULL".

    Prints "Contains NULL" if at least one item equals "NULL", otherwise
    "Not contains NULL". Nothing is returned.
    """
    has_null = any(entry == "NULL" for entry in checked_list)
    print("Contains NULL" if has_null else "Not contains NULL")

In [29]:
# Read the natural-disasters CSV from the Data2 folder under the current
# working directory and discard rows in which every cell is missing.
path = os.path.abspath(os.getcwd())
df = pd.read_csv(path + "/Data2/Natural Disasters copy.csv").dropna(axis=0, how="all")
print(df.columns)

Index(['村志代码 Gazetteer Code', '村志书名 Gazetteer Title',
       '自然灾害种类 Types of Natural Disasters', '年份 Years', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')


In [41]:
# Add the output column(s) as placeholders, then replace every missing cell
# in the frame with the string "NULL" so later cells compare against a single
# sentinel value.
new_columns = ['categoryId']
for column in new_columns:
    df[column] = None
df = df.where(df.notnull(), "NULL")

# Distinct disaster types in sorted order; checkNULL reports whether the
# "NULL" sentinel is among them.
total_categories = sorted(df['自然灾害种类 Types of Natural Disasters'].astype('category').unique())
checkNULL(total_categories)

# dic_category_id stores {category_name: id}. Ids start at 1 and follow the
# sorted order of the names; the "NULL" sentinel never receives an id.
dic_category_id = {}
count = 1
for category in total_categories:
    if category == "NULL" or category in dic_category_id:
        continue
    dic_category_id[category] = count
    count += 1

Not contains NULL


In [43]:
# Create the categoryId column of the dataframe: translate each row's
# disaster type into its numeric id from dic_category_id.
df_categoryId = []
for i, category in enumerate(df['自然灾害种类 Types of Natural Disasters']):
    if category not in dic_category_id:
        # Stop with an explicit error naming the offending row position. A
        # partially filled id list could not be assigned to the column anyway,
        # because its length would no longer match the dataframe.
        raise ValueError("Not recorded category for entity " + str(i))
    df_categoryId.append(dic_category_id[category])
df['categoryId'] = df_categoryId

In [44]:
# Build natural_disasters_df: one output record (gazetteerId, categoryId,
# year) for every year cell of the source frame that is not "NULL".
natural_disasters_df = pd.DataFrame(columns = ['gazetteerId', 'categoryId', 'year'])
# The year values sit in the columns at positions 3 through 24.
years_heading = list(df.columns)[3:25]
dic_for_natural_disasters_df = {'gazetteerId':[], 'categoryId':[], 'year':[]}

# Visit each source row once and emit a record per filled-in year cell.
for i in range(len(df)):
    row = df.iloc[i]
    for year in years_heading:
        if row[year] == "NULL":
            continue
        dic_for_natural_disasters_df['gazetteerId'].append(row['村志代码 Gazetteer Code'])
        dic_for_natural_disasters_df['categoryId'].append(row['categoryId'])
        dic_for_natural_disasters_df['year'].append(row[year])

# Copy the collected lists into the output frame, column by column.
for attribute in natural_disasters_df.columns:
    natural_disasters_df[attribute] = dic_for_natural_disasters_df[attribute]

natural_disasters_df.head()

Unnamed: 0,gazetteerId,categoryId,year
0,1,7,1959
1,1,7,1988
2,1,7,1991
3,1,2,2002
4,1,8,1997


In [48]:
# Build category_df, the category lookup table with columns id, name and
# parentId. A key of the form "<child>__<parent>" is stored under the child's
# name with the parent's id; a key without "__" gets "NULL" as its parentId.
category_df = pd.DataFrame(columns = ['id', 'name', 'parentId'])
dic_for_category_df = {'id':[], 'name':[], 'parentId':[]}

for category in dic_category_id:
    child_parent = category.split('__', 1)
    name = child_parent[0]
    # Resolve the parent first so a missing parent fails before anything is appended.
    parentId = "NULL" if len(child_parent) == 1 else dic_category_id[child_parent[1]]
    dic_for_category_df['id'].append(dic_category_id[category])
    dic_for_category_df['name'].append(name)
    dic_for_category_df['parentId'].append(parentId)

# Copy the collected lists into the output frame, column by column.
for attribute in category_df.columns:
    category_df[attribute] = dic_for_category_df[attribute]
len(category_df)

19

In [49]:
# Preview the first rows of category_df (columns: id, name, parentId).
category_df.head()

Unnamed: 0,id,name,parentId
0,1,冰凌 Icestorm,
1,2,冰雹 Hailstorm,
2,3,台风 Typhoon,
3,4,地震 Earthquake,
4,5,寒潮 Cold Wave,


In [26]:
# Write the disaster records to natural_disasters_df.csv without the index
# column; missing values are written as the text "NULL".
natural_disasters_df.to_csv('natural_disasters_df.csv', index = False, na_rep = "NULL")

In [27]:
# Export the category table (id, name, parentId) without the index column;
# missing values are written as the text "NULL". The notebook builds this
# table under the name category_df, so that is the frame written here. The
# output file name is left as it was.
category_df.to_csv('naturalDisastersCategory_df.csv', index = False, na_rep = "NULL")