# **Processing manually extracted files from the data bucket -- discontinued**

Here, we import the .json-formatted data and, as a first plan, convert it to a pandas DataFrame. Later, tensors might be useful considering the high-dimensionality of the data, but it may be overkill for now

First, we import relevant packages

In [11]:
import os
from typing import List, Dict, Any, Tuple
from datetime import datetime
from exceptions import MissingKeyError

import json
import pandas as pd

For now, we only have some manually downloaded files from the Google Cloud bucket. Thus, we only access one folder. Later, we'll add an automated parsing function

In [12]:
def read_file(file_path : str) -> Dict:
    """
    Reads a .json file and return its content as a dictionary

    :param file_path: path to the file
    :return: dictionary with the content of the file
    """
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
            
            # Add the file path to the data, so we can track the source of the data later
            data['path'] = file_path
            return data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return {}
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file: {file_path}")
        return {}
    

def get_file_contents(file_paths : List[str]) -> List[Dict]:
    """
    Reads the content of the files and returns it as a list of dictionaries

    :param file_paths: list of paths to the files
    :return: list of dictionaries with the content of the files
    """
    return [read_file(file_path) for file_path in file_paths]


def generate_path(folder_path: str, file_name: str) -> str:
    """
    Generates the path to the file

    :param folder_path: path to the folder
    :param file_name: name of the file
    :return: path to the file
    """
    return os.path.join(folder_path, file_name)


def generate_file_paths(folder_path: str, file_names: List[str]) -> List[str]:
    """
    Generates the paths to the files

    :param folder_path: path to the folder
    :param file_names: list of file names
    :return: list of paths to the files
    """
    return [generate_path(folder_path, file_name) for file_name in file_names]


def get_file_names(folder_path : str) -> List[str]:
    """
    Return a list of filenames within given directory

    :param file_path: path to the directory
    :return: list of filenames
    """
    try:
        return os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Directory not found: {folder_path}")
        return []


def get_file_dictionaries(folder_path : str) -> List[Dict]:
    """
    Reads the content of the files and returns it as a list of dictionaries

    :param folder_path: path to the directory containing the files
    :return: list of dictionaries with the content of the files
    """
    return get_file_contents(generate_file_paths(folder_path, get_file_names(folder_path)))

Now, we only need a path to a folder and we can extract all .json files in there and convert them to a list of dictionaries

In [13]:
path_manual_data = "../../data/floods-data_manual/2022/01/25/00/00"
dicts_manual_data = get_file_dictionaries(path_manual_data)

print(type(dicts_manual_data))
print(len(dicts_manual_data))
print(dicts_manual_data[0])

<class 'list'>
423


Next, we extract the dictionaries into a pandas DataFrame

In [128]:
def check_required_keys(d: Dict, file: str, required_keys: List[str]) -> None:
    """
    Checks if the required keys are present in the dictionary

    :param d: Dictionary to check
    :param file: Filename or path for error reporting
    :param required_keys: List of required keys
    :raises MissingKeyError: If any required key is missing
    """
    for key in required_keys:
        if key not in d:
            raise MissingKeyError(key, file)


def extract_dictionary_data(d: Dict) -> Dict:
    """
    Extracts the data from the dictionary

    :param dictionary: dictionary with the data
    :return: Transformed dictionary
    """
    alert_metadata = d.get('alertMetadata', {})
    water_level_thresholds = alert_metadata.get('waterLevelThresholds', {})

    check_required_keys(d, d['path'], ['issueTimestampSeconds', 'expirationTimestampSeconds', 'hasFlooding'])
    check_required_keys(alert_metadata, d['path'], ['district', 'gaugeStationName', 'stationId', 'publisherName', 'river', 'state'])
    check_required_keys(water_level_thresholds, d['path'], ['warningLevel', 'dangerLevel', 'extremeDangerLevel'])

    # If all is well, we can return the dictionary. Else, the Exception will be raised
    return {
        
        # Convert Unix timestamp to DateTime
        'DateTime': datetime.fromtimestamp(int(d['issueTimestampSeconds'])),
        'district': alert_metadata['district'],
        'gaugeStationName': alert_metadata['gaugeStationName'],
        'stationId': alert_metadata['stationId'],
        'publisherName': alert_metadata['publisherName'],
        'river': alert_metadata['river'],
        'state': alert_metadata['state'],
        'warningLevel': alert_metadata['waterLevelThresholds']['warningLevel'],
        'dangerLevel': alert_metadata['waterLevelThresholds']['dangerLevel'],
        'extremeDangerLevel': alert_metadata['waterLevelThresholds']['extremeDangerLevel'],

        # Same conversion for expiration timestamp
        'expirationTimestamp': datetime.fromtimestamp(int(d['expirationTimestampSeconds'])),
        'hasFlooding': d['hasFlooding']
    }


def create_dataframe_from_dicts(dicts: List[Dict]) -> pd.DataFrame:
    """
    Creates a DataFrame from a list of dictionaries and sets a DateTime index

    :param dicts: List of dictionaries
    :return: DataFrame with DateTime index
    """
    data = []
    for d in dicts:
        try:
            data.append(extract_dictionary_data(d))
        except MissingKeyError as e:
            print(e)
    
    return pd.DataFrame(data)


def convert_columns_to_string(df):
    """
    Converts all columns of type 'object' in the DataFrame to string

    :param df: DataFrame to convert
    :return: Modified DataFrame
    """
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].astype('str')
    return df


def modify_column_strings(s: str) -> str:
    """
    Capitalizes each alphabetic sequence in a string. Non-alphabetic
    characters are treated as delimiters. Newlines are replaced with
    spaces, and spaces are added before and after each slash '/'.

    :param s: Input string
    :return: Capitalized string
    """
    result = []
    capitalize_next = True

    for char in s:
        if char.isalpha():
            if capitalize_next:
                result.append(char.upper())
                capitalize_next = False
            else:
                result.append(char.lower())
        else:
            if char == '\n':
                result.append(' ')
            elif char == '/':
                result.append(' / ')
            else:
                result.append(char)
            capitalize_next = True

    return ''.join(result)


def modify_string_columns(df: pd.DataFrame, cols: str) -> pd.DataFrame:
    """
    Converts all columns of type 'object' (strings) in the DataFrame to lowercase

    :param df: DataFrame to convert
    :return: Modified DataFrame
    """
    for column in df.columns:
        print(df[column].dtype)
        if column in cols:
            df[column] = df[column].astype(str).apply(modify_column_strings)
    return df


def tidy_up_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tidies up the DataFrame by doing various modifications

    :param df: DataFrame to tidy up
    :return: Tidied up DataFrame
    """
    # These are the columns we want to convert to lowercase because of
    # inconsistencies in their capitalization
    to_be_modified_string_columns = [
        'district',
        'gaugeStationName',
        'river',
        'state'
    ]

    df.set_index('DateTime', inplace = True)
    df.sort_index(inplace = True)
    # df = convert_columns_to_string(df)
    df = modify_string_columns(df, to_be_modified_string_columns)

    return df

In [129]:
df_manual_data = create_dataframe_from_dicts(dicts_manual_data)

Missing key 'state' in file '../../data/floods-data_manual/2022/01/25/00/00\2022_01_25_00_00_01c33b58fa4a4ccf8c994ea267a6acc7_01c33b58fa4a4ccf8c994ea267a6acc7.json'
Missing key 'publisherName' in file '../../data/floods-data_manual/2022/01/25/00/00\2022_01_25_00_00_03096dd7ff66423f86f064ec650e2aae_03096dd7ff66423f86f064ec650e2aae.json'
Missing key 'publisherName' in file '../../data/floods-data_manual/2022/01/25/00/00\2022_01_25_00_00_04371a70d79e40439a076b4ce47ee995_04371a70d79e40439a076b4ce47ee995.json'
Missing key 'extremeDangerLevel' in file '../../data/floods-data_manual/2022/01/25/00/00\2022_01_25_00_00_045a6d8b77a44ce68acca83057151a98_045a6d8b77a44ce68acca83057151a98.json'
Missing key 'district' in file '../../data/floods-data_manual/2022/01/25/00/00\2022_01_25_00_00_0a1729335b954a7c8d36181d3f929fad_0a1729335b954a7c8d36181d3f929fad.json'
Missing key 'publisherName' in file '../../data/floods-data_manual/2022/01/25/00/00\2022_01_25_00_00_0afbbf7bfdce485bafcf24056819ac1a_0afbbf7bf

In [130]:
df_manual_data = tidy_up_dataframe(df_manual_data)

object
object
object
object
object
object
float64
float64
float64
datetime64[ns]
bool


In [131]:
print(df_manual_data.dtypes)

district                       object
gaugeStationName               object
stationId                      object
publisherName                  object
river                          object
state                          object
dangerLevel                   float64
extremeDangerLevel            float64
expirationTimestamp    datetime64[ns]
hasFlooding                      bool
dtype: object


In [134]:
df_manual_data.head(3)

Unnamed: 0_level_0,district,gaugeStationName,stationId,publisherName,river,state,warningLevel,dangerLevel,extremeDangerLevel,expirationTimestamp,hasFlooding
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-01-25 01:24:11,Dibrugarh,Dillighat,014-UBDDIB,"B&BBO, Shillong",Brahmaputra / Desang,Assam,128.0,129.0,129.65,2022-01-27 01:24:11,False
2022-01-25 01:24:11,Mulugu,Eturunagaram,015-LGDHYD,"KGBO, Hyderabad",Godavari,Andhra Pradesh,73.32,75.82,77.66,2022-01-27 01:24:11,False
2022-01-25 01:24:11,Mayurbhanj,Baripada,004-ERDBWN,Mahanadi & Eastern Rivers,Burhabalang,Orissa,29.7,30.92,34.82,2022-01-27 01:24:11,False


In [135]:
df_manual_data.tail(3)

Unnamed: 0_level_0,district,gaugeStationName,stationId,publisherName,river,state,warningLevel,dangerLevel,extremeDangerLevel,expirationTimestamp,hasFlooding
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-01-25 01:28:10,Sitapur,Batpurwaghat,006-mgd2lkn,"UGBO, Lucknow",Ganga / Gomti,Uttar Pradesh,113.3,114.3,115.55,2022-01-27 01:28:10,False
2022-01-25 01:28:29,Purba Champaran,Chatia,010-mgd4ptn,LGBO Patna,Ganga / Gandak,Bihar,68.15,69.15,70.04,2022-01-27 01:28:29,False
2022-01-25 01:28:46,Kanyakumari,Ashramam,023-SWRDKOCHI,"C&SRO, CWC Coimbatore",Pazhayar,Tamilnadu,4.5,6.0,7.1,2022-01-27 01:28:46,False


To get to know the spatial coverage of the data, we make an overview of all the districts, rivers, and states present in the data by ...

Conclusion: These are Indian data, not African. Adding coordinates and all will thus come later, including mapping.

In [143]:
def get_unique_values(df, column_name):
    """
    Returns a list of unique values in the column

    :param df: DataFrame
    :param column_name: Name of the column
    :return: List of unique values
    """
    unique_values = df[column_name].unique().tolist()
    return unique_values


def print_list(l: List) -> None:
    """
    Prints a list of items

    :param l: List of items
    """
    for i in l:
        print(i)


def print_unique_col_vals(df, col):
    """
    Prints unique values in a column

    :param df: DataFrame
    :param col: Column name
    """
    print_list(sorted(get_unique_values(df, col)))

In [144]:
print_unique_col_vals(df_manual_data, 'district')

Adilabad
Agra
Ahmadnagar
Ahmedabad
Almora
Anantnag
Araria
Auraiya
Ayodhya
Bahraich
Baksa
Balasore
Baleshwar
Ballia
Balrampur
Barabanki
Baramula
Barpeta
Bastar
Basti
Bhadradri
Bhadrak
Bhagalpur
Bhandara
Bharuch
Bhupalpally
Bijapur
Birbhum
Budaun
Buldana
Buxar
Cachar
Chamoli
Chandrapur
Cuttack
Dadra And Nagar Haveli
Dakshin Kannada
Darbhanga
Datia
Dehradun
Deoria
Dhalai
Dholpur
Dhubri
Dhule
Dibrugarh
East Godavari
East Siang
Ernakulam
Erode
Etawah
Farrukhabad
Gajapati
Ganjam
Gaya
Ghaziabad
Ghazipur
Goalpara
Golaghat
Gonda
Gopalganj
Gorakhpur
Hailakandi
Hamirpur
Haridwar
Hoshangabad
Idukki
Jagatsinghapur
Jajapur
Jalaun
Jalpaiguri
Jaunpur
Jehanabad
Jhansi
Jorhat
Kamrup
Kannur
Kanpur Dehat
Kanpur Nagar
Kanyakumari
Karnal
Karur
Katihar
Kendujhar
Khagaria
Khammam
Kheda
Kheri
Kishanganj
Kochbihar
Kokrajhar
Kolhapur
Kollam
Kota
Kottayam
Kozhikode
Krishna
Kurnool
Kushinagar
Lakhimpur
Lohit
Lucknow
Madhubani
Maharajganj
Malappuram
Mandla
Marigaon
Mathura
Mayurbhanj
Medinipur
Mirzapur
Moradabad
Mu

In [147]:
print_unique_col_vals(df_manual_data, 'state')

Andhra Pradesh
Arunachal Pradesh
Assam
Bihar
Chhattishgarh
D.&Nh
Delhi
Gujarat
Haryana
Himachal Pradesh
Jammu & Kashmir
Jharkhand
Karnataka
Kerala
Madhya Pradesh
Maharashtra
Odisha
Orissa
Rajasthan
Tamilnadu
Tripura
Uttar Pradesh
Uttarakhand
West Bengal
West Sikkim


In [146]:
print_unique_col_vals(df_manual_data, 'river')

Aghanashini
Baitarni
Banas
Barak
Barak / Katakhal
Bav
Bhagirathi / Ajoy
Bharathapuzha
Bhogeswari
Brahmani
Brahmaputra
Brahmaputra /  Brahmaputra
Brahmaputra /  Buridehing
Brahmaputra /  Desang
Brahmaputra /  Dhansari(South)
Brahmaputra /  Dhansiri(South)
Brahmaputra /  Dikhow
Brahmaputra /  Jiabharali
Brahmaputra /  Kopili
Brahmaputra /  Lohit
Brahmaputra /  Noa-Dehing
Brahmaputra /  Ranganadi
Brahmaputra /  Siang
Brahmaputra /  Subansiri
Brahmaputra /  Teesta
Brahmaputra /  Torsa
Brahmaputra / Beki
Brahmaputra / Gaurang
Brahmaputra / Jaldhaka
Brahmaputra / Manas
Brahmaputra / Pagladiya
Brahmaputra / Puthimari
Brahmaputra / Sonkosh
Brahmaputra / Torsa / Raidak-I
Burhabalang
Cauvery
Cauvery / Amaravathi
Cauvery / Bhavani
Chaliyar
Chambal
Damanganga
Damanganga / Wagh
Ganga
Ganga / Alaknanda
Ganga / Alaknanda / Pinder
Ganga / Bhagirath
Ganga / Burhi Gandak
Ganga / Gandak
Ganga / Ghaghra
Ganga / Ghaghra / Kwano
Ganga / Ghaghra / Little Gandak
Ganga / Ghaghra / Rapti
Ganga / Ghaghra / Rapti

In [148]:
print_unique_col_vals(df_manual_data, 'gaugeStationName')

Abu Road
Addoor
Agra (J.B.)
Ahirwalia (Seasonal)
Akhuapada
Alipingal
Anandapur
Ankinghat
Anna Purna Ghat
Arangaly
Araria (Seasonal)
Arjunwad (Seasonal)
Ashramam
Auralya
Avanigadda
Ayilam
Ayodhya
B.K. Ghat
Badar Pur Ghat
Badatighat
Badlapur
Ballia
Balrampur
Baltara
Bamni
Banda
Bani
Bansi
Bantwal
Baripada
Basti
Basua
Batpurwaghat
Beki Road Bridge
Belonia
Benibad
Bhadrachalam
Bhagalpur
Bhandara
Bharuch
Bhikiasen
Bihubar
Birdghat
Burhanpur
Buxar
Chandradeepghat
Chatia
Chenimari
Chhapra (Seasonal)
Chhatang Allahabad
Chillaghat
Chinturu
Chopan
Chouldhowaghat
Colonelganj
Dabri
Dalmau
Daltenganj
Darauli
Delhi Rly Bridge
Deongaon Bridge
Dhansa
Dharamtul
Dheng Bridge
Dhengraghat
Dholabazar
Dholpur
Dhond
Dhubri
Dibrugarh
Dighaghat (Seasonal)
Dillighat
Domohani
Dowlaiswaram
Dumariaghat
Dummugudem
Ekmighat
Elginbridge
Etawah
Eturunagaram
Fatehgarh
Gajaraia
Gandhighat
Gangahed W / L Station
Gangpur Siswan (Seasonal)
Garhamukteshwar
Garrauli
Garudeshwar
Gaya
Ghaighat
Ghazipur
Gheropara
Ghugumari
Gidh