## Install Libraries

In [10]:
! pip install pandas
! pip install requests



## Import Libraries

In [4]:
from ftplib import FTP
from pathlib import Path

import numpy as np
import pandas as pd
import requests   # more convenient for http(s) urls

## Download Data

The "year".csv files contain all daily and station elements found in GHCN daily for the given year.  These 
files are updated daily for the entire period of record.

The following information serves as a definition of each field in one line of data covering one station-day. 
Each field described below is separated by a comma ( , ) and follows the order below:

ID = 11 character station identification code
YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986)
ELEMENT = 4 character indicator of element type 
DATA VALUE = 5 character data value for ELEMENT 
M-FLAG = 1 character Measurement Flag 
Q-FLAG = 1 character Quality Flag 
S-FLAG = 1 character Source Flag 
OBS-TIME = 4-character time of observation in hour-minute format (e.g. 0700 = 7:00 am)

See section III of the GHCN-Daily readme.txt file (ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
for an explanation of ELEMENT codes and their units as well as the M-FLAG, Q-FLAG and S-FLAG.

The OBS-TIME field is populated with the observation times contained in NOAA/NCEI's HOMR station history database.  

In [18]:
# Fetch the GHCN-Daily readme.txt via anonymous FTP and store it under ./data.
url = "ftp://ftp.ncei.noaa.gov/pub/data/ghcn/daily"
# Split "ftp://host/path" into the host name and the remote directory.
site, fpath = url.split("//")[1].split("/", 1)
fname = "readme.txt"

# The local target directory is not guaranteed to exist on a fresh checkout;
# without it open() below fails with FileNotFoundError.
Path("./data").mkdir(parents=True, exist_ok=True)

# The timeout keeps a stalled connection from blocking the notebook forever.
with FTP(site, timeout=60) as session:
    session.login()  # no credentials passed, i.e. an anonymous login
    session.set_debuglevel(1)  # print the FTP command/response trace from here on
    session.cwd(fpath)
    with open(f"./data/{fname}", 'wb') as local_file:
        session.retrbinary(f"RETR {fname}", local_file.write)

*cmd* 'CWD pub/data/ghcn/daily'
*resp* '250 CWD command successful'
*cmd* 'TYPE I'
*resp* '200 Type set to I'
*cmd* 'EPSV'
*resp* '229 Entering Extended Passive Mode (|||63021|)'
*cmd* 'RETR readme.txt'
*resp* '150 Opening BINARY mode data connection for readme.txt (28140 bytes)'
*resp* '226 Transfer complete'
*cmd* 'QUIT'
*resp* '221 Goodbye.'


In [5]:
# Years whose GHCN-Daily "by_year" archives are downloaded (and processed later).
years = [1949, 1950, 1951, 1952]

# Make sure the download directory exists before any file is written.
weather_dir = Path("./data/weather")
weather_dir.mkdir(parents=True, exist_ok=True)

for year in years:
    print(f"...Downloading data from year {year}....")
    url = f"https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/{year}.csv.gz"
    # Without a timeout a stalled connection would block the cell indefinitely.
    response = requests.get(url, timeout=60)
    if not response.ok:
        print("An error occured while trying to retrieve the data from the internet.")
        # Skip the success message below: nothing was saved for this year.
        continue

    # The file name is the last path component of the URL, e.g. "1949.csv.gz".
    filename = url.rsplit('/', 1)[1]
    print(f"data downloaded. Will be saved as {filename}")
    with open(weather_dir / filename, "wb") as f:
        f.write(response.content)
    print(f"Data from year {year} downloaded and saved.")


...Downloading data from year 1949....
data downloaded. Will be saved as 1949.csv.gz
Data from year 1949 downloaded and saved.
...Downloading data from year 1950....
data downloaded. Will be saved as 1950.csv.gz
Data from year 1950 downloaded and saved.
...Downloading data from year 1951....
data downloaded. Will be saved as 1951.csv.gz
Data from year 1951 downloaded and saved.
...Downloading data from year 1952....
data downloaded. Will be saved as 1952.csv.gz
Data from year 1952 downloaded and saved.


## Process Data

### Daily Record Structure:
| Column name | Desc | Data type |
| --- | --- | --- |
| ID | 11 character station identification code | Character |
| YEAR/MONTH/DAY | 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986) | Character |
| ELEMENT  | 4 character indicator of element type  | Character |
| DATA VALUE | 5 character data value for ELEMENT | Character |
| M-FLAG | 1 character Measurement Flag | Character |
| Q-FLAG | 1 character Quality Flag | Character |
| S-FLAG | 1 character Source Flag | Character |
| OBS-TIME | 4-character time of observation in hour-minute format (e.g. 0700 = 7:00 am) | Character |


In [14]:
for year in years:
    # Only in here so that processing is not started for all years right away:
    # for now just the first configured year is processed.
    if year == years[0]:
        print(f"Year {year} processing.")
        file_path = f'./data/weather/{year}.csv.gz'
        filename = file_path

        # The yearly files have no header row, so column names are supplied here.
        columns = ["stationcode", "datelabel", "param", "value", "mflag", "qflag", "sflag", "time"]

        df = pd.read_csv(filename, names=columns, compression="gzip")

        # convert values to float
        df = df.astype({"value": "float32"})

        # cleanse dataset: keep only the parameters of interest, i.e. TMIN, TMAX, PRCP, SNOW
        keep = ["TMIN", "TMAX", "PRCP", "SNOW"]

        # .copy() makes the filtered frame independent of the full frame, so the
        # in-place scaling below cannot trigger a SettingWithCopyWarning.
        df = df[df["param"].isin(keep)].copy()

        # Multipliers applied per parameter; SNOW is intentionally left unscaled.
        # NOTE(review): presumably converts tenths of degC / tenths of mm to
        # degC / mm - confirm against section III of the GHCN-Daily readme.
        scaling_factors = {"TMIN": 0.1, "TMAX": 0.1, "PRCP": 0.1}

        for k, v in scaling_factors.items():
            df.loc[df["param"]==k,"value"] *= v

        # Replace missing entries (e.g. absent flags or observation times)
        # with empty strings.
        df = df.fillna('')

        print(df)

        print(f"Processing of year {year} finished.")

        # Per-year export, currently disabled:
        # df.to_csv(f"./data/export/weather/modified_{year}.csv", index=False)

          stationcode  datelabel param      value mflag qflag sflag time
0         ACW00011604   19490101  TMAX  28.900000                 X     
1         ACW00011604   19490101  TMIN  21.700001                 X     
2         ACW00011604   19490101  PRCP   0.000000                 X     
3         ACW00011604   19490101  SNOW   0.000000                 X     
5         AG000060390   19490101  TMAX  16.600000                 G     
...               ...        ...   ...        ...   ...   ...   ...  ...
23787114  WZ004094600   19491231  PRCP   0.000000                 I     
23787115  WZ004451000   19491231  PRCP   0.000000                 I     
23787116  WZ004467410   19491231  PRCP  14.200000                 I     
23787117  WZ004822290   19491231  PRCP   0.000000                 I     
23787118  WZ004834260   19491231  PRCP  17.800001                 I     

[18089254 rows x 8 columns]
Processing of year 1949 finished.
          stationcode  datelabel param      value mflag qflag

In [15]:
# Preview the first rows of `df`.
# NOTE(review): the stored output below lists station metadata columns
# (Station_ID, Latitude, ...) rather than the weather columns assigned in the
# processing cell - presumably stale output; confirm with Restart & Run All.
df.head()

Unnamed: 0,Station_ID,Latitude,Longitude,Elevation,State,Station_Name,GSN_Flag,HCN_CRN_Flag,WMO_ID
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
2,AE000041196,25.333,55.517,34.0,,SHARJAH INTER. AIRP,GSN,,41196.0
3,AEM00041194,25.255,55.364,10.4,,DUBAI INTL,,,41194.0
4,AEM00041217,24.433,54.651,26.8,,ABU DHABI INTL,,,41217.0


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for `df`.
df.describe()

Unnamed: 0,Latitude,Longitude,Elevation
count,125988.0,125988.0,125988.0
mean,25.741179,-44.342157,447.031776
std,28.323435,89.3985,656.340957
min,-90.0,-179.9947,-999.9
25%,20.919175,-100.54005,85.0
50%,36.91685,-84.7275,257.6
75%,42.82045,2.072075,596.8
max,83.65,179.7414,5033.0


In [17]:
# Column names, non-null counts, dtypes and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125988 entries, 0 to 125987
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Station_ID    125988 non-null  object 
 1   Latitude      125988 non-null  float64
 2   Longitude     125988 non-null  float64
 3   Elevation     125988 non-null  float64
 4   State         125988 non-null  object 
 5   Station_Name  125988 non-null  object 
 6   GSN_Flag      125988 non-null  object 
 7   HCN_CRN_Flag  125988 non-null  object 
 8   WMO_ID        125988 non-null  object 
dtypes: float64(3), object(6)
memory usage: 8.7+ MB


## Export dataframe to CSV file

In [18]:
# Write the current `df` to CSV without the index column.
# to_csv does not create missing parent directories, so create them first.
export_path = Path("./data/export/weather/modified_y.csv")
export_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(export_path, index=False)