In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

In [2]:
raw_files_directory = Path("raw_data/data_files/")
example_raw_file = "CLIMAT_RAW_200303.txt"

## Read stations list

In [3]:
stations_file = "stations_list_CLIMAT_data.txt"

In [4]:
!wc -l stations_list_CLIMAT_data.txt

    4399 stations_list_CLIMAT_data.txt


In [5]:
!tail -10 stations_list_CLIMAT_data.txt 

98748;          Cagayan de Oro;                                08.48;    124.65;      5; Philippines                                    
98755;          Hinatuan;                                      08.37;    126.34;      3; Philippines                                    
98836;          Zamboanga;                                     06.92;    122.06;      6; Philippines                                    
98851;          General Santos;                                06.06;    125.10;    133; Philippines                                    
99030;          unknown station;                                    ;          ;       ;                                                    
99080;          unknown station;                                    ;          ;       ;                                                    
99090;          unknown station;                                    ;          ;       ;                                                    
99092;         

In [6]:
!head -10 stations_list_CLIMAT_data.txt 

WMO-Station ID; StationName;                                Latitude; Longitude; Height; Country
01001;          Jan Mayen;                                     70.94;    -08.67;      9; Norway                                         
01005;          Isfjord Radio;                                 78.06;     13.63;      9; Norway                                         
01007;          Ny-Alesund;                                    78.92;     11.93;      8; Norway                                         
01008;          Svalbard;                                      78.25;     15.50;     27; Norway                                         
01025;          Tromso/Langnes;                                69.68;     18.91;      9; Norway                                         
01026;          Tromso;                                        69.65;     18.94;    114; Norway                                         
01028;          Bjornoya;                                      74.5

In [7]:
stations_df = pd.read_csv(stations_file, sep=';', encoding='unicode_escape')
stations_df

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country
0,01001,Jan Mayen,70.94,-08.67,9,Norway
1,01005,Isfjord Radio,78.06,13.63,9,Norway
2,01007,Ny-Alesund,78.92,11.93,8,Norway
3,01008,Svalbard,78.25,15.50,27,Norway
4,01025,Tromso/Langnes,69.68,18.91,9,Norway
...,...,...,...,...,...,...
4394,99080,unknown station,,,,...
4395,99090,unknown station,,,,...
4396,99092,unknown station,,,,...
4397,99113,unknown station,,,,...


In [8]:
stations_df.columns = [col.strip() for col in list(stations_df.columns)]
stations_df.columns

Index(['WMO-Station ID', 'StationName', 'Latitude', 'Longitude', 'Height',
       'Country'],
      dtype='object')

In [9]:
stations_df.describe()

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country
count,4399,4398,4398.0,4398.0,4398,4398
unique,4399,4261,4247.0,3839.0,1262,205
top,1001,unknown station,,,3,United States of America
freq,1,42,42.0,49.0,90,346


In [10]:
numerical_columns = ['Latitude', 'Longitude', 'Height']
for numerical_column in numerical_columns:
    stations_df[numerical_column] = pd.to_numeric(stations_df[numerical_column].str.strip())

stations_df.dtypes

WMO-Station ID     object
StationName        object
Latitude          float64
Longitude         float64
Height            float64
Country            object
dtype: object

In [11]:
stations_df.describe()

Unnamed: 0,Latitude,Longitude,Height
count,4349.0,4349.0,4349.0
mean,20.981856,15.188876,387.922281
std,33.372126,78.412486,598.065947
min,-90.0,-179.97,-350.0
25%,-2.17,-57.09,27.0
50%,31.33,14.65,136.0
75%,47.01,70.52,475.0
max,83.66,179.75,4670.0


In [12]:
stations_df.dtypes

WMO-Station ID     object
StationName        object
Latitude          float64
Longitude         float64
Height            float64
Country            object
dtype: object

In [13]:
string_columns = ['StationName', 'Country']
for string_column in string_columns:
    stations_df[string_column] = stations_df[string_column].str.strip()

In [14]:
stations_df['StationName'].unique()

array(['Jan Mayen', 'Isfjord Radio', 'Ny-Alesund', ..., 'Zamboanga',
       'General Santos', nan], dtype=object)

## Read a sample data file

In [15]:
! head -3 raw_data/data_files/CLIMAT_RAW_202107.txt

year;month;IIiii;G1;Po;G1;P;G1;sn;T;st;G1;sn;Tx;sn;Tn;G1;e;G1;R1;Rd;nr;G1;S1;ps;G1;mp;mT;mTx;mTn;G1;me;mR;mS;G2;Yb;Yc;G2;Po;G2;P;G2;sn;T;st;G2;sn;Tx;sn;Tn;G2;e;G2;R1;nr;G2;S1;G2;YP;YT;YTx;G2;Ye;YR;YS;G3;T25;T30;G3;T35;T40;G3;Tn0;Tx0;G3;R01;R05;G3;R10;R50;G3;R100;R150;G3;s00;s01;G3;s10;s50;G3;f10;f20;f30;G3;V1;V2;V3;G4;sn;Txd;yx;G4;sn;Tnd;yn;G4;sn;Tax;yax;G4;sn;Tan;yan;G4;Rx;yr;G4;iw;fx;yfx;G4;Dts;Dgr;G4;iy;Gx;Gn
2021;07;01001;1;10095;2;10107;3;0;069;013;4;0;094;0;052;5;086;6;0005;0;02;;;;8;00;00;0;0;9;00;00;31;0;61;90;1;10106;2;10092;3;0;042;016;4;0;064;0;027;5;075;6;0048;09;;;8;00;00;00;9;00;00;//;0;00;00;1;00;00;2;00;00;3;02;00;4;00;00;5;00;00;;;;;;;8;11;00;00;9;00;00;12;0;0;090;18;1;0;030;02;2;0;134;09;3;0;009;02;4;0013;23;5;1;376;01;6;00;00;7;2;18;18
2021;07;01007;1;10048;2;10057;3;0;055;014;4;0;067;0;040;5;077;6;0009;1;02;;;;8;00;00;0;0;9;00;00;31;0;61;90;1;10114;2;10104;3;0;049;018;4;0;070;0;036;5;076;6;0023;06;;;8;14;14;14;9;14;14;;0;00;00;1;00;00;2;00;00;3;02;00;4;00;00;5;0

In [16]:
example_raw_file = "CLIMAT_RAW_202107.txt"
sample_raw_file_df = pd.read_csv(raw_files_directory / example_raw_file, sep=';')

sample_raw_file_df

Unnamed: 0,year,month,IIiii,G1,Po,G1.1,P,G1.2,sn,T,...,iw,fx,yfx,G4.6,Dts,Dgr,G4.7,iy,Gx,Gn
0,2021,7.0,1001.0,1.0,10095.0,2.0,10107.0,3.0,0.0,69.0,...,1.0,376.0,1.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
1,2021,7.0,1007.0,1.0,10048.0,2.0,10057.0,3.0,0.0,55.0,...,1.0,151.0,51.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
2,2021,7.0,1008.0,1.0,10022.0,2.0,10056.0,3.0,0.0,62.0,...,1.0,150.0,30.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
3,2021,7.0,1025.0,1.0,10096.0,2.0,10107.0,3.0,0.0,117.0,...,1.0,190.0,15.0,,,,7.0,2.0,18.0,18.0
4,2021,7.0,1026.0,1.0,9968.0,2.0,10106.0,3.0,0.0,120.0,...,1.0,163.0,16.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2775,2021,7.0,98653.0,1.0,10035.0,2.0,10080.0,3.0,0.0,291.0,...,1.0,160.0,23.0,6.0,3.0,0.0,,,,
2776,2021,7.0,98755.0,1.0,10078.0,2.0,10082.0,3.0,0.0,291.0,...,1.0,120.0,4.0,6.0,10.0,0.0,,,,
2777,2021,7.0,98836.0,1.0,10081.0,2.0,10089.0,3.0,0.0,291.0,...,1.0,140.0,16.0,6.0,3.0,0.0,,,,
2778,2021,7.0,98851.0,1.0,9940.0,2.0,10089.0,3.0,0.0,266.0,...,1.0,120.0,28.0,6.0,7.0,0.0,,,,


In [17]:
sample_raw_file_df = sample_raw_file_df.dropna(subset=['IIiii'])
sample_raw_file_df['IIiii'] = sample_raw_file_df['IIiii'].astype(int)
sample_raw_file_df = sample_raw_file_df.rename(columns={'IIiii': 'WMO-Station ID'})
sample_raw_file_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_raw_file_df['IIiii'] = sample_raw_file_df['IIiii'].astype(int)


Unnamed: 0,year,month,WMO-Station ID,G1,Po,G1.1,P,G1.2,sn,T,...,iw,fx,yfx,G4.6,Dts,Dgr,G4.7,iy,Gx,Gn
0,2021,7.0,1001,1.0,10095.0,2.0,10107.0,3.0,0.0,69.0,...,1.0,376.0,1.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
1,2021,7.0,1007,1.0,10048.0,2.0,10057.0,3.0,0.0,55.0,...,1.0,151.0,51.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
2,2021,7.0,1008,1.0,10022.0,2.0,10056.0,3.0,0.0,62.0,...,1.0,150.0,30.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
3,2021,7.0,1025,1.0,10096.0,2.0,10107.0,3.0,0.0,117.0,...,1.0,190.0,15.0,,,,7.0,2.0,18.0,18.0
4,2021,7.0,1026,1.0,9968.0,2.0,10106.0,3.0,0.0,120.0,...,1.0,163.0,16.0,6.0,0.0,0.0,7.0,2.0,18.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2774,2021,7.0,98646,1.0,10052.0,2.0,10080.0,3.0,0.0,293.0,...,1.0,200.0,22.0,6.0,8.0,0.0,,,,
2775,2021,7.0,98653,1.0,10035.0,2.0,10080.0,3.0,0.0,291.0,...,1.0,160.0,23.0,6.0,3.0,0.0,,,,
2776,2021,7.0,98755,1.0,10078.0,2.0,10082.0,3.0,0.0,291.0,...,1.0,120.0,4.0,6.0,10.0,0.0,,,,
2777,2021,7.0,98836,1.0,10081.0,2.0,10089.0,3.0,0.0,291.0,...,1.0,140.0,16.0,6.0,3.0,0.0,,,,


In [18]:
column_names = sample_raw_file_df.columns
updated_column_names = []
for idx, col in enumerate(column_names):
    if col.startswith('sn'):
        col = 'sn_' + column_names[idx + 1]

    updated_column_names.append(col)
sample_raw_file_df.columns = updated_column_names
sample_raw_file_df.columns

Index(['year', 'month', 'WMO-Station ID', 'G1', 'Po', 'G1.1', 'P', 'G1.2',
       'sn_T', 'T',
       ...
       'iw', 'fx', 'yfx', 'G4.6', 'Dts', 'Dgr', 'G4.7', 'iy', 'Gx', 'Gn'],
      dtype='object', length=127)

In [19]:
relevant_columns = ["year", "month", "WMO-Station ID", "sn_T", "T", "st", "sn_Tn","Tn", "sn_Tx", "Tx"]
sample_raw_file_df = sample_raw_file_df[relevant_columns]
sample_raw_file_df

Unnamed: 0,year,month,WMO-Station ID,sn_T,T,st,sn_Tn,Tn,sn_Tx,Tx
0,2021,7.0,1001,0.0,69.0,13.0,0.0,52.0,0.0,94.0
1,2021,7.0,1007,0.0,55.0,14.0,0.0,40.0,0.0,67.0
2,2021,7.0,1008,0.0,62.0,14.0,0.0,45.0,0.0,83.0
3,2021,7.0,1025,0.0,117.0,30.0,0.0,94.0,0.0,150.0
4,2021,7.0,1026,0.0,120.0,36.0,0.0,90.0,0.0,159.0
...,...,...,...,...,...,...,...,...,...,...
2774,2021,7.0,98646,0.0,293.0,10.0,0.0,260.0,0.0,326.0
2775,2021,7.0,98653,0.0,291.0,5.0,0.0,239.0,0.0,325.0
2776,2021,7.0,98755,0.0,291.0,16.0,0.0,243.0,0.0,338.0
2777,2021,7.0,98836,0.0,291.0,,0.0,242.0,0.0,340.0


In [20]:
sample_raw_file_df.describe()

Unnamed: 0,month,WMO-Station ID,sn_T,T,st,sn_Tn,Tn,sn_Tx,Tx
count,2779.0,2779.0,2684.0,2684.0,2501.0,2649.0,2647.0,2658.0,2655.0
mean,7.0,52951.400864,0.027571,219.991058,24.805678,0.03473,170.676993,0.020692,275.133333
std,0.0,29241.539207,0.178994,73.647759,21.793646,0.233847,74.709411,0.144998,75.611954
min,7.0,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,7.0,26864.0,0.0,177.0,15.0,0.0,124.0,0.0,234.0
50%,7.0,60429.0,0.0,226.0,21.0,0.0,170.0,0.0,285.0
75%,7.0,76141.0,0.0,272.0,28.0,0.0,228.0,0.0,322.0
max,7.0,98851.0,4.0,671.0,335.0,8.0,711.0,2.0,625.0


In [21]:
sample_raw_file_df.columns

Index(['year', 'month', 'WMO-Station ID', 'sn_T', 'T', 'st', 'sn_Tn', 'Tn',
       'sn_Tx', 'Tx'],
      dtype='object')

In [22]:
sample_raw_file_df['WMO-Station ID'].nunique()

2779

In [23]:
sample_raw_file_df.dtypes

year               object
month             float64
WMO-Station ID      int64
sn_T              float64
T                 float64
st                float64
sn_Tn             float64
Tn                float64
sn_Tx             float64
Tx                float64
dtype: object

In [24]:
sample_raw_file_df = sample_raw_file_df.astype({'year': int, 'month': int})
sample_raw_file_df.dtypes

year                int64
month               int64
WMO-Station ID      int64
sn_T              float64
T                 float64
st                float64
sn_Tn             float64
Tn                float64
sn_Tx             float64
Tx                float64
dtype: object

## Check data for some places manually

In [25]:
stations_df[stations_df['WMO-Station ID']==1001]

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country


In [26]:
stations_df[stations_df['Country']=='India']

Unnamed: 0,WMO-Station ID,StationName,Latitude,Longitude,Height,Country
1520,42027,Srinagar,34.08,74.83,1587.0,India
1521,42034,Leh,34.15,77.57,3513.0,India
1522,42071,Amritsar,31.71,74.8,234.0,India
1523,42099,Ludhiana,30.93,75.87,247.0,India
1524,42131,Hissar,29.17,75.73,210.0,India
1525,42147,Mukteshwar,29.47,79.65,2311.0,India
1526,42165,Bikaner,28.0,73.3,224.0,India
1527,42182,New Delhi/Safdarjung Airp.,28.58,77.2,216.0,India
1528,42260,Agra,27.15,77.97,169.0,India
1529,42261,Agra,27.17,78.03,169.0,India


In [27]:
delhi_station_id = 42182
leh_station_id = 42034

In [28]:
sample_raw_file_df[sample_raw_file_df['WMO-Station ID'].isin([delhi_station_id, leh_station_id, 42027])]

Unnamed: 0,year,month,WMO-Station ID,sn_T,T,st,sn_Tn,Tn,sn_Tx,Tx
1052,2021,7,42027,0.0,247.0,25.0,0.0,195.0,0.0,300.0
1056,2021,7,42182,0.0,314.0,31.0,0.0,264.0,0.0,365.0


In [29]:
sample_raw_file_df['WMO-Station ID'].nunique()

2779

In [30]:
stations_df['WMO-Station ID'].nunique()

4399