## Install Libraries

In [1]:
! pip install pandas
! pip install requests
! pip install numpy



## Import Libraries

In [2]:
import pandas as pd
from ftplib import FTP
import requests   # more convenient for http(s) urls
import numpy as np
import os

## Download Data

The "year".csv files contain all daily and station elements found in GHCN daily for the given year.  These 
files are updated daily for the entire period of record.

The following information serves as a definition of each field in one line of data covering one station-day. 
Each field described below is separated by a comma ( , ) and follows the order below:

ID = 11 character station identification code
YEAR/MONTH/DAY = 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986)
ELEMENT = 4 character indicator of element type 
DATA VALUE = 5 character data value for ELEMENT 
M-FLAG = 1 character Measurement Flag 
Q-FLAG = 1 character Quality Flag 
S-FLAG = 1 character Source Flag 
OBS-TIME = 4-character time of observation in hour-minute format (i.e. 0700 =7:00 am)

See section III of the GHCN-Daily readme.txt file (ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt)
for an explanation of ELEMENT codes and their units as well as the M-FLAG, Q-FLAG and S-FLAG.

The OBS-TIME field is populated with the observation times contained in NOAA/NCEI's HOMR station history database.  

In [3]:
url = "ftp://ftp.ncei.noaa.gov/pub/data/ghcn/daily"
site, fpath = url.split("//")[1].split("/", 1)
with FTP(site) as session:
    session.login()  # "anonymous", "your-email@uni-koeln.de")
    session.set_debuglevel(1)
    session.cwd(fpath)
    directory_path = "data"
    os.makedirs(directory_path, exist_ok=True)
    fname = "readme.txt"
    with open(f"./data/{fname}", 'wb') as local_file:
        session.retrbinary(f"RETR {fname}", local_file.write)

*cmd* 'CWD pub/data/ghcn/daily'
*resp* '250 CWD command successful'
*cmd* 'TYPE I'
*resp* '200 Type set to I'
*cmd* 'PASV'
*resp* '227 Entering Passive Mode (205,167,25,137,255,247).'
*cmd* 'RETR readme.txt'
*resp* '150 Opening BINARY mode data connection for readme.txt (28140 bytes)'
*resp* '226 Transfer complete'
*cmd* 'QUIT'
*resp* '221 Goodbye.'


In [4]:
years = [1949, 1950, 1951, 1952]

for year in years:
    print(f"...Downloading data from year {year}....")
    url = f"https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/{year}.csv.gz"
    response = requests.get(url)
    if response.ok:
        filename = url.rsplit('/', 1)[1]
        print(f"data downloaded. Will be saved as {filename}")
        directory_path = "data/weather"
        os.makedirs(directory_path, exist_ok=True)
        with open(f"./data/weather/{filename}", "wb") as f:
            f.write(response.content)
    else:
        print("An error occured while trying to retrieve the data from the internet.")
    print(f"Data from year {year} downloaded and saved.")


...Downloading data from year 1949....
data downloaded. Will be saved as 1949.csv.gz
Data from year 1949 downloaded and saved.
...Downloading data from year 1950....
data downloaded. Will be saved as 1950.csv.gz
Data from year 1950 downloaded and saved.
...Downloading data from year 1951....
data downloaded. Will be saved as 1951.csv.gz
Data from year 1951 downloaded and saved.
...Downloading data from year 1952....
data downloaded. Will be saved as 1952.csv.gz
Data from year 1952 downloaded and saved.


## Process Data

### Stations Structure:
| Column name | Desc | Data type |
| --- | --- | --- |
| ID | 11 character station identification code | Character |
| YEAR/MONTH/DAY | 8 character date in YYYYMMDD format (e.g. 19860529 = May 29, 1986) | Character |
| ELEMENT  | 4 character indicator of element type  | Character |
| DATA VALUE | 5 character data value for ELEMENT | Character |
| M-FLAG | 1 character Measurement Flag | Character |
| Q-FLAG | 1 character Quality Flag | Character |
| S-FLAG | 1 character Source Flag | Character |
| OBS-TIME | 4-character time of observation in hour-minute format (i.e. 0700 =7:00 am) | Character |


In [None]:
for year in years: 
    # hier nur drinne damit nicht direkt für alle Jahre gestartet wird
    if year == years[0]:
        print(f"Year {year} processing.")
        file_path = f'./data/weather/{year}.csv.gz'
        filename = f'./data/weather/{year}.csv.gz'

        columns = ["stationcode", "datelabel", "param", "value", "mflag", "qflag", "sflag", "time"]
        
        df = pd.read_csv(filename, names=columns, compression="gzip")

        # convert values to float
        df = df.astype({"value": "float32"})
        

        # cleanse dataset: keep only the parameters of interest, i.e. TMIN, TMAX, PRCP, SNOW
        keep = ["TMIN", "TMAX", "PRCP", "SNOW"]

        df = df[df["param"].isin(keep)]

        scaling_factors = {"TMIN": 0.1, "TMAX": 0.1, "PRCP": 0.1}

        for k, v in scaling_factors.items():
            df.loc[df["param"]==k,"value"] *= v

        # # df = df.replace(np.nan, '', regex=True)
        # df.replace(['None', 'nan'], np.nan, inplace=True)
        df = df.fillna('')

        print(df)

        print(f"Processing of year {year} finished.")

        # directory_path = "data/export/weather"
        # os.makedirs(directory_path, exist_ok=True)
        # df.to_csv(f"./data/export/weather/modified_{year}.csv", index=False)

In [6]:
df.head()

Unnamed: 0,stationcode,datelabel,param,value,mflag,qflag,sflag,time
0,ACW00011604,19490101,TMAX,28.9,,,X,
1,ACW00011604,19490101,TMIN,21.700001,,,X,
2,ACW00011604,19490101,PRCP,0.0,,,X,
3,ACW00011604,19490101,SNOW,0.0,,,X,
5,AG000060390,19490101,TMAX,16.6,,,G,


In [7]:
df.describe()

Unnamed: 0,datelabel,value
count,18089250.0,18089250.0
mean,19490670.0,4.951462
std,344.1054,13.97374
min,19490100.0,-71.1
25%,19490400.0,0.0
50%,19490700.0,0.0
75%,19491000.0,6.7
max,19491230.0,6505.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18089254 entries, 0 to 23787118
Data columns (total 8 columns):
 #   Column       Dtype  
---  ------       -----  
 0   stationcode  object 
 1   datelabel    int64  
 2   param        object 
 3   value        float32
 4   mflag        object 
 5   qflag        object 
 6   sflag        object 
 7   time         object 
dtypes: float32(1), int64(1), object(6)
memory usage: 1.1+ GB


## Export dataframe to CSV file

In [9]:
df.to_csv("./data/export/weather/modified_y.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'data\export\weather'