In [1]:
import pandas as pd


In [2]:
# Directory holding the raw monthly data files, and one file name used as an example.
# NOTE(review): both names are assigned again in the "Read a sample data file"
# section before any read in the cells shown, so the 200303 file is never loaded here.
raw_files_directory = "raw_data/data_files/"
example_raw_file = "CLIMAT_RAW_200303.txt"

## Read stations list

In [3]:
# Path (relative to the notebook's working directory) of the station metadata file.
stations_file = "stations_list_CLIMAT_data.txt"

In [4]:
!head -5 stations_list_CLIMAT_data.txt 

WMO-Station ID; StationName;                                Latitude; Longitude; Height; Country
01001;          Jan Mayen;                                     70.94;    -08.67;      9; Norway                                         
01005;          Isfjord Radio;                                 78.06;     13.63;      9; Norway                                         
01007;          Ny-Alesund;                                    78.92;     11.93;      8; Norway                                         
01008;          Svalbard;                                      78.25;     15.50;     27; Norway                                         


In [5]:
# Semicolon-delimited text file whose first line is the header row.
stations_df = pd.read_csv(
    stations_file,
    sep=';',
    header=0,
    encoding="ISO-8859-1",
)

In [6]:
# Look at the end of the table to spot any trailing junk rows picked up by the read.
stations_df.tail(3)

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country
4396,99092,unknown station,,,,...
4397,99113,unknown station,,,,...
4398,,,,,,


In [7]:
# The read picks up a footer row whose fields are all empty (last row of the
# tail output above). Remove it only while it is still present: the previous
# `drop([shape[0]-1])` removed one more real station every time the cell was re-run.
if len(stations_df) > 0:
    # Ignore the first column: a stray end-of-file marker could land there.
    footer_fields = stations_df.iloc[-1, 1:]
    footer_is_junk = all(
        pd.isna(value) or not str(value).strip() for value in footer_fields
    )
    if footer_is_junk:
        stations_df = stations_df.drop(index=stations_df.index[-1])

In [8]:
# Re-check the tail to confirm the trailing junk row is gone.
stations_df.tail(3)

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country
4395,99090,unknown station,,,,...
4396,99092,unknown station,,,,...
4397,99113,unknown station,,,,...


In [9]:
# First rows, for comparison with the raw `head` preview of the file.
stations_df.head(3)

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country
0,1001,Jan Mayen,70.94,-8.67,9,Norway
1,1005,Isfjord Radio,78.06,13.63,9,Norway
2,1007,Ny-Alesund,78.92,11.93,8,Norway


In [10]:
# The header cells are space-padded in the file; trim the padding from every column name.
stations_df.columns = stations_df.columns.str.strip()

In [11]:
# Verify the column names no longer carry padding.
stations_df.columns

Index(['WMO-Station ID', 'StationName', 'Latitude', 'Longitude', 'Height',
       'Country'],
      dtype='object')

In [12]:
# Trim surrounding whitespace from the values of every column.
# Relies on each column supporting the `.str` accessor at this point.
stations_df = stations_df.apply(lambda column: column.str.strip())

In [13]:
# Inspect the column dtypes after stripping the values.
stations_df.dtypes

WMO-Station ID    object
StationName       object
Latitude          object
Longitude         object
Height            object
Country           object
dtype: object

In [14]:
# Cast the station id to int, then rename it to IIiii so it matches the id
# column name used in the raw data files.
# Latitude / Longitude / Height are left as they are; a float cast for them
# was drafted here but left disabled.
stations_df = (
    stations_df
    .astype({'WMO-Station ID': int})
    .rename(columns={'WMO-Station ID': 'IIiii'})
)

In [15]:
# Confirm the id column is now named IIiii and has an integer dtype.
stations_df.dtypes

IIiii           int64
StationName    object
Latitude       object
Longitude      object
Height         object
Country        object
dtype: object

In [16]:
# Distinct country labels. In the run shown, the result includes an empty
# string, i.e. some rows have a blank Country.
stations_df['Country'].unique()

array(['Norway', 'Sweden', 'Finland',
       'United Kingdom of Great Britain and N.-Ireland', 'Ireland',
       'Iceland', 'Greenland', 'Faroe Islands', 'Denmark', 'Netherlands',
       'Belgium', 'Luxembourg', 'Switzerland', 'France', 'Spain',
       'Gibraltar', 'Portugal', 'Cape Verde', 'Germany', 'Austria',
       'Czech Republic', 'Slovakia (Slovak. Rep.)', 'Poland', 'Hungary',
       'Slowenia', 'Croatia/Hrvatska', 'Serbia', 'Bosnia and Herzegowina',
       'Montenegro', 'Macedonia', 'Albania', 'Romania', 'Bulgaria',
       'Italy', 'Malta', 'Greece', 'Turkey', 'Cyprus',
       'Russian Federation', 'Estonia', 'Latvia', 'Lithuania', 'Belarus',
       'Kazakhstan', 'Ukraine', 'Moldova, Rep. Of', 'Kyrgyzstan',
       'Georgia', '', 'Azerbaijan', 'Armenia', 'Uzbekistan',
       'Turkmenistan', 'Tajikistan', 'Syrian Arab Rep.', 'Lebanon',
       'Israel', 'Jordan', 'Saudi Arabia', 'Bahrain', 'Qatar', 'Oman',
       'Kuwait', 'Yemen', 'Iraq', 'Iran (Islamic Rep. of)', 'Afghanistan',


## Read a sample data file

In [17]:
# Month to inspect. Both names were already set near the top of the notebook;
# the values assigned here are the ones the read below uses.
raw_files_directory = "raw_data/data_files/"
example_raw_file = "CLIMAT_RAW_202107.txt"

# Load only the date, station id and the T / Tn / Tx columns.
# NOTE(review): the file header repeats T, Tx and Tn in a second group of
# columns — confirm that usecols selects the occurrence you intend.
sample_raw_file_df = pd.read_csv(
    raw_files_directory + example_raw_file,
    sep=';',
    header=0,
    encoding="ISO-8859-1",
    usecols=["year", "month", "IIiii", "T", "Tn", "Tx"],
)

In [18]:
! head -3 raw_data/data_files/CLIMAT_RAW_202107.txt

year;month;IIiii;G1;Po;G1;P;G1;sn;T;st;G1;sn;Tx;sn;Tn;G1;e;G1;R1;Rd;nr;G1;S1;ps;G1;mp;mT;mTx;mTn;G1;me;mR;mS;G2;Yb;Yc;G2;Po;G2;P;G2;sn;T;st;G2;sn;Tx;sn;Tn;G2;e;G2;R1;nr;G2;S1;G2;YP;YT;YTx;G2;Ye;YR;YS;G3;T25;T30;G3;T35;T40;G3;Tn0;Tx0;G3;R01;R05;G3;R10;R50;G3;R100;R150;G3;s00;s01;G3;s10;s50;G3;f10;f20;f30;G3;V1;V2;V3;G4;sn;Txd;yx;G4;sn;Tnd;yn;G4;sn;Tax;yax;G4;sn;Tan;yan;G4;Rx;yr;G4;iw;fx;yfx;G4;Dts;Dgr;G4;iy;Gx;Gn
2021;07;01001;1;10095;2;10107;3;0;069;013;4;0;094;0;052;5;086;6;0005;0;02;;;;8;00;00;0;0;9;00;00;31;0;61;90;1;10106;2;10092;3;0;042;016;4;0;064;0;027;5;075;6;0048;09;;;8;00;00;00;9;00;00;//;0;00;00;1;00;00;2;00;00;3;02;00;4;00;00;5;00;00;;;;;;;8;11;00;00;9;00;00;12;0;0;090;18;1;0;030;02;2;0;134;09;3;0;009;02;4;0013;23;5;1;376;01;6;00;00;7;2;18;18
2021;07;01007;1;10048;2;10057;3;0;055;014;4;0;067;0;040;5;077;6;0009;1;02;;;;8;00;00;0;0;9;00;00;31;0;61;90;1;10114;2;10104;3;0;049;018;4;0;070;0;036;5;076;6;0023;06;;;8;14;14;14;9;14;14;;0;00;00;1;00;00;2;00;00;3;02;00;4;00;00;5;0

In [19]:
# The read picks up a trailing row with no month and no station id. Remove it
# only while it is still present: the previous `drop([shape[0]-1])` removed one
# more real observation every time the cell was re-run.
if len(sample_raw_file_df) > 0:
    footer_keys = sample_raw_file_df[["month", "IIiii"]].iloc[-1]
    if footer_keys.isna().all():
        sample_raw_file_df = sample_raw_file_df.drop(index=sample_raw_file_df.index[-1])

In [20]:
# Summary statistics for the numeric columns (year is still object dtype at
# this point, so it does not appear in the table).
# NOTE(review): T / Tx / Tn magnitudes are in the hundreds — presumably a
# scaled unit; confirm against the data format description.
sample_raw_file_df.describe()

Unnamed: 0,month,IIiii,T,Tx,Tn
count,2779.0,2779.0,2684.0,2655.0,2647.0
mean,7.0,52951.400864,219.991058,275.133333,170.676993
std,0.0,29241.539207,73.647759,75.611954,74.709411
min,7.0,1001.0,0.0,2.0,0.0
25%,7.0,26864.0,177.0,234.0,124.0
50%,7.0,60429.0,226.0,285.0,170.0
75%,7.0,76141.0,272.0,322.0,228.0
max,7.0,98851.0,671.0,625.0,711.0


In [21]:
# Check the end of the table now that the trailing row has been dropped.
sample_raw_file_df.tail()

Unnamed: 0,year,month,IIiii,T,Tx,Tn
2774,2021,7.0,98646.0,293.0,326.0,260.0
2775,2021,7.0,98653.0,291.0,325.0,239.0
2776,2021,7.0,98755.0,291.0,338.0,243.0
2777,2021,7.0,98836.0,291.0,340.0,242.0
2778,2021,7.0,98851.0,266.0,309.0,222.0


In [22]:
# Columns retained by the usecols selection.
sample_raw_file_df.columns

Index(['year', 'month', 'IIiii', 'T', 'Tx', 'Tn'], dtype='object')

In [23]:
# Number of distinct station ids in the sample file.
sample_raw_file_df['IIiii'].nunique()

2779

In [24]:
# Inspect dtypes before casting the key columns to integers.
sample_raw_file_df.dtypes

year      object
month    float64
IIiii    float64
T        float64
Tx       float64
Tn       float64
dtype: object

In [25]:
# year was read as object and month / IIiii as floats; store all three as
# integers so IIiii matches the integer station id in stations_df.
integer_columns = ['IIiii', 'year', 'month']
sample_raw_file_df = sample_raw_file_df.astype(
    {column_name: int for column_name in integer_columns}
)

## Check data for some places manually

In [26]:
# Recap of the stations table columns before the manual checks.
stations_df.columns

Index(['IIiii', 'StationName', 'Latitude', 'Longitude', 'Height', 'Country'], dtype='object')

In [27]:
# Recap of the stations table dtypes (only IIiii has been cast to a number).
stations_df.dtypes

IIiii           int64
StationName    object
Latitude       object
Longitude      object
Height         object
Country        object
dtype: object

In [29]:
# (rows, columns) of the stations table.
stations_df.shape

(4398, 6)

In [31]:
# Station ids present in the stations list.
stations_df['IIiii'].unique()

array([ 1001,  1005,  1007, ..., 99090, 99092, 99113])

In [32]:
# Look up one station by its integer id (1001 is the first row of the table).
stations_df.loc[stations_df['IIiii'].eq(1001)]

Unnamed: 0,IIiii,StationName,Latitude,Longitude,Height,Country
0,1001,Jan Mayen,70.94,-8.67,9,Norway


In [34]:
# All stations whose (already stripped) Country value is exactly 'India'.
stations_df.loc[stations_df['Country'].eq('India')]

Unnamed: 0,IIiii,StationName,Latitude,Longitude,Height,Country
1520,42027,Srinagar,34.08,74.83,1587,India
1521,42034,Leh,34.15,77.57,3513,India
1522,42071,Amritsar,31.71,74.8,234,India
1523,42099,Ludhiana,30.93,75.87,247,India
1524,42131,Hissar,29.17,75.73,210,India
1525,42147,Mukteshwar,29.47,79.65,2311,India
1526,42165,Bikaner,28.0,73.3,224,India
1527,42182,New Delhi/Safdarjung Airp.,28.58,77.2,216,India
1528,42260,Agra,27.15,77.97,169,India
1529,42261,Agra,27.17,78.03,169,India


In [35]:
# Station ids taken from the India listing above:
# 42182 = New Delhi/Safdarjung Airp., 42034 = Leh.
delhi_station_id = 42182
leh_station_id = 42034

In [41]:
# Rows of the sample month for the stations picked for a manual check.
# 42027 is Srinagar in the India listing above.
stations_to_check = [delhi_station_id, leh_station_id, 42027]
is_station_to_check = sample_raw_file_df['IIiii'].isin(stations_to_check)
sample_raw_file_df[is_station_to_check]

Unnamed: 0,year,month,IIiii,T,Tx,Tn
1052,2021,7,42027,247.0,300.0,195.0
1056,2021,7,42182,314.0,365.0,264.0


In [39]:
# Distinct station ids reporting in the sample month.
sample_raw_file_df['IIiii'].nunique()

2779

In [40]:
# Distinct station ids in the stations list, for comparison with the sample-month count.
stations_df['IIiii'].nunique()

4398