In [3]:
import os
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 5)

# FOR DISTRICT

In [4]:
# Base directory for every input and output path below: the current working directory.
# NOTE(review): `dir` shadows the Python builtin of the same name; later cells
# reference it, so the name is left unchanged here.
dir = os.getcwd()
#SPECS TO READ SUMMARY DATA
# Eight fixed-width fields of 10 characters each, one per name in summary_cols.
summary_specs = [(0,10),(10,20),(20,30),(30,40),(40,50),(50,60),(60,70),(70,80)]
summary_cols = ["year","mean_tmin","mean_tmax","mean_pre", "sum_pre", "min_tmin", "max_tmax","std_pre"]
# Folder holding the per-station summary files, and the CSV that lists the stations.
summary_path = os.path.join(dir,"summary")
station_list_path = os.path.join(dir,"station list.csv")

In [5]:
# Layout of the district summary files: seven fields, each 10 characters wide,
# one per conversion in district_format and one per name in district_cols.
district_format = '%10d%10.4f%10.4f%10.4f%10.2f%10.2f%10.2f'
# Column boundaries matching district_format. The previous list repeated (40,50)
# and stopped at 60, so the fifth field was read twice and the last one was never read.
district_specs = [(0,10),(10,20),(20,30),(30,40),(40,50),(50,60),(60,70)]
district_cols = ["year","mean_tmin","mean_tmax","mean_pre", "sum_pre", "min_tmin", "max_tmax"]
# Output folder for the per-district summary files.
district_path = os.path.join(dir,"district_summary")

In [6]:
#READ DATA FOR EACH WEATHER STATION
# data_dict maps each file name found in summary_path to the DataFrame read from it.
data_dict = {}
files = os.listdir(summary_path)
for file in files:
    data_path = os.path.join(summary_path,file)
    # skiprows=1 skips the first line of each file; columns are cut at the
    # fixed-width boundaries in summary_specs and named from summary_cols.
    data_dict[file] =  pd.read_fwf(data_path, skiprows=1, skipfooter=0,\
                                   colspecs=summary_specs,names=summary_cols)

In [7]:
# Names of the entries in the summary folder; these are the same keys used in data_dict.
all_stations = os.listdir(summary_path)
#GET ALL THE STATION INFORMATION FROM LIST
station_list = pd.read_csv(station_list_path, sep=";" )
#FORMAT THE STATION LIST
# Zero-pad index_no to at least four digits so it can be compared with the file names.
# NOTE(review): '%04d' requires numeric values -- confirm index_no is read as a number.
station_list['index_no'] = station_list['index_no'].map(lambda x: '%04d'%x)
#STATION_INFO NOW CONTAINS ONLY STATIONS THAT WE HAVE DATA
station_info = station_list[station_list.index_no.isin(all_stations)]
#GET DISTRICT LIST:
district_list = station_info['district'].unique()

In [62]:
def process_data(joined_data):
    """Collapse station rows into one summary row per year.

    Parameters
    ----------
    joined_data : pandas.DataFrame
        Rows with the columns year, mean_tmin, mean_tmax, mean_pre, sum_pre,
        min_tmin and max_tmax. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per distinct year, in order of first appearance, with the
        columns year, mean_tmin, mean_tmax, mean_pre, sum_pre, min_tmin and
        max_tmax. The three mean_* columns are averaged, sum_pre is summed,
        min_tmin is the minimum and max_tmax the maximum over that year's rows.
    """
    # (column, reduction) pairs, listed in the order of the output columns.
    reducers = (
        ("mean_tmin", "mean"),
        ("mean_tmax", "mean"),
        ("mean_pre", "mean"),
        ("sum_pre", "sum"),
        ("min_tmin", "min"),
        ("max_tmax", "max"),
    )
    collected = {"year": []}
    for column, _ in reducers:
        collected[column] = []

    for current_year in joined_data.year.unique():
        yearly_rows = joined_data[joined_data.year == current_year]
        collected["year"].append(current_year)
        for column, how in reducers:
            # Values are cast to float before reducing, whatever dtype was read.
            as_float = getattr(yearly_rows, column).astype('float')
            collected[column].append(getattr(as_float, how)())

    return pd.DataFrame(collected)

def write_data(data, path, table_type, fmt, cols):
    """Write a summary table as formatted text to the file ``path/table_type``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to write; its ``.values`` array is what gets saved.
    path : str
        Output folder. It is created, including parents, if it does not exist.
    table_type : object
        Used as the file name after conversion with ``str``.
    fmt : str
        Row format passed to ``np.savetxt``.
    cols : list of str
        Column names, joined with commas to form the header line.
    """
    # np.savetxt does not create folders, so a run against a fresh checkout
    # would fail on the first write without this.
    os.makedirs(path, exist_ok=True)
    np.savetxt(os.path.join(path, str(table_type)), data.values, fmt=fmt,
               header=','.join(cols))

In [63]:
def get_stats_for_district(district, data_dict = data_dict):
    """Return the yearly summary for all stations in one district.

    Parameters
    ----------
    district : value compared against ``station_info.district``
        District whose stations are combined.
    data_dict : dict
        Maps a station's ``index_no`` to its DataFrame. The default is the
        module-level ``data_dict`` as it exists when this function is defined.

    Returns
    -------
    pandas.DataFrame
        The result of ``process_data`` on the combined station rows.
    """
    # Station ids (index_no) of the stations located in this district.
    local_stations = station_info[station_info.district == district].index_no.tolist()
    # Stack the per-station tables and order the rows by year.
    joined_data = pd.concat([data_dict[i] for i in local_stations]).sort_values(by = "year")
    # The earlier call was to an undefined name `process` with a keyword that
    # process_data does not accept; process_data is the aggregation helper.
    return process_data(joined_data)

# Compute and write one summary file per district; each file is named after its district.
for district in district_list:
    district_data = get_stats_for_district(district)
    write_data(district_data, district_path, district, district_format, district_cols)


# FOR REGION

In [18]:
# CSV that assigns each district to a region and to subregion flag columns.
district_region_path = os.path.join(dir,"District_region.csv")
# Layout of the region summary files: seven fields, each 10 characters wide,
# one per conversion in region_format and one per name in region_cols.
region_format = '%10d%10.4f%10.4f%10.4f%10.2f%10.2f%10.2f'
# Column boundaries matching region_format. The previous list repeated (40,50)
# and stopped at 60, so the fifth field was read twice and the last one was never read.
region_specs = [(0,10),(10,20),(20,30),(30,40),(40,50),(50,60),(60,70)]
region_cols = ["year","mean_tmin","mean_tmax","mean_pre", "sum_pre", "min_tmin", "max_tmax"]
# Output folder for the per-region summary files.
region_path = os.path.join(dir,"region_summary")

In [10]:
# Load the semicolon-separated district-to-region table and display it.
district_region = pd.read_csv(district_region_path, sep=";" )
district_region

Unnamed: 0,SN,District,Region,Hill,Moutain,Terai,Western,Middle,Eastern
0,1,Humla,Western Mountain,0,0,0,1,0,0
1,2,Darchula,Western Mountain,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
75,76,Morang,Eastern Terai,0,0,1,0,0,1
76,77,Jhapa,Eastern Terai,0,0,1,0,0,1


In [13]:
pd.set_option("display.max_rows", 5)
# Attach the district_region columns to each station by matching the station's
# `district` value against the table's `District` column.
# The join raises when column names overlap, which is what happens if this cell
# is run a second time, so it is skipped once the Region column is present.
if "Region" not in station_info.columns:
    station_info = station_info.join(district_region.set_index("District"), on = "district")

In [70]:
def get_stats_for_region(region, data_dict = data_dict):
    """Return the yearly summary for all stations in one region.

    Parameters
    ----------
    region : value compared against ``station_info.Region``
        Region whose stations are combined.
    data_dict : dict
        Maps a station's ``index_no`` to its DataFrame.

    Returns
    -------
    pandas.DataFrame
        The result of ``process_data`` on the combined station rows.
    """
    in_region = station_info.Region == region
    station_ids = station_info[in_region].index_no.tolist()
    # Stack the per-station tables, then order the rows by year.
    frames = [data_dict[station_id] for station_id in station_ids]
    combined = pd.concat(frames)
    return process_data(combined.sort_values(by = "year"))

In [50]:
# Distinct region names among stations that have a non-null Region value.
region_list = station_info[station_info.Region.notnull()].Region.unique()
region_list

array(['Middle Hill', 'Western Hill', 'Eastern Hill', 'Eastern Mountain',
       'Eastern Terai', 'Western Mountain', 'Western Terai',
       'Middle Mountain', 'Middle Terai'], dtype=object)

In [64]:
# Compute and write one summary file per region; each file is named after its region.
for region in region_list:
    # Print the region name as a progress indicator.
    print(region)
    region_data = get_stats_for_region(region)
    write_data(region_data, region_path, region, region_format, region_cols)


Middle Hill
Western Hill
Eastern Hill
Eastern Mountain
Eastern Terai
Western Mountain
Western Terai
Middle Mountain
Middle Terai


# FOR SUBREGION

In [73]:
# Preview: index_no of the stations whose "Hill" flag column equals 1.
station_info[station_info.loc[:,"Hill"] == 1].loc[:,"index_no"]

4      1004
42     0104
       ... 
469    0809
473    0814
Name: index_no, Length: 13, dtype: object

In [76]:
# Output folder for the subregion summaries; the whole-country summary is written here too.
subregion_path = os.path.join(dir,"subregion_summary")

In [74]:

def get_stats_for_subregion(subregion, data_dict = data_dict):
    """Return the yearly summary for all stations flagged in one subregion.

    Parameters
    ----------
    subregion : str
        Name of a ``station_info`` column; stations where that column equals 1
        are selected.
    data_dict : dict
        Maps a station's ``index_no`` to its DataFrame.

    Returns
    -------
    pandas.DataFrame
        The result of ``process_data`` on the combined station rows.
    """
    flagged = station_info.loc[:, subregion] == 1
    station_ids = station_info[flagged].loc[:, "index_no"]
    # Stack the per-station tables, then order the rows by year.
    frames = [data_dict[station_id] for station_id in station_ids]
    combined = pd.concat(frames)
    return process_data(combined.sort_values(by = "year"))

In [81]:
# Each name is looked up as a flag column of station_info.
# NOTE(review): the district_region table displayed earlier in this notebook shows
# a column header spelled "Moutain"; confirm the CSV header really is "Mountain",
# otherwise the lookup for that subregion raises KeyError.
subregions =  ["Hill", "Mountain","Terai", "Eastern", "Western", "Middle"]
# Compute and write one summary file per subregion, using the region file layout.
for subregion in subregions:
    # Print the subregion name as a progress indicator.
    print(subregion)
    subregion_data = get_stats_for_subregion(subregion)
    write_data(subregion_data, subregion_path, subregion, region_format, region_cols)

Hill
Mountain
Terai
Eastern
Western
Middle


In [79]:
def get_stats_for_country(data_dict = data_dict):
    """Return the yearly summary over every station in ``station_info``.

    Parameters
    ----------
    data_dict : dict
        Maps a station's ``index_no`` to its DataFrame.

    Returns
    -------
    pandas.DataFrame
        The result of ``process_data`` on the combined station rows.
    """
    # No filtering here: every station listed in station_info contributes.
    station_ids = station_info.index_no
    frames = [data_dict[station_id] for station_id in station_ids]
    combined = pd.concat(frames)
    return process_data(combined.sort_values(by = "year"))

In [82]:
# Summarise all stations together and write the result as "whole_country"
# into the subregion output folder, using the region file layout.
country_data = get_stats_for_country()
write_data(country_data, subregion_path, "whole_country", region_format, region_cols)