## Install Libraries

In [1]:
! pip install pandas
! pip install requests
! pip install numpy
! pip install os



## Import Libraries

In [4]:
import pandas as pd
from ftplib import FTP
import requests   # more convenient for http(s) urls
import numpy as np
import os

## Download Data

In [2]:
# Download the GHCN-Daily station list over HTTPS and store it in the local data folder.
url = "https://www.ncei.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
if response.ok:
    # The local file keeps the last path component of the URL as its name.
    filename = url.rsplit('/', 1)[1]
    print(f"data downloaded. Will be saved as {filename}")
    directory_path = "data"
    os.makedirs(directory_path, exist_ok=True)
    # Build the target from directory_path so the folder that is created and
    # the folder that is written to cannot drift apart.
    with open(os.path.join(directory_path, filename), "wb") as f:
        f.write(response.content)
else:
    print("An error occured while trying to retrieve the data from the internet.")


NameError: name 'requests' is not defined

In [3]:
# Download the GHCN-Daily documentation via FTP (only readme.txt is fetched here).
url = "ftp://ftp.ncei.noaa.gov/pub/data/ghcn/daily"
# Split "ftp://host/remote/dir" into the host name and the remote directory.
site, fpath = url.split("//")[1].split("/", 1)
# A timeout keeps the cell from hanging forever if the server stops responding.
with FTP(site, timeout=60) as session:
    session.login()  # no credentials passed; ftplib then logs in anonymously
    session.set_debuglevel(1)  # echo the FTP conversation for this session
    session.cwd(fpath)
    directory_path = "data"
    os.makedirs(directory_path, exist_ok=True)
    fname = "readme.txt"
    # Build the target from directory_path so the created folder and the
    # written file always agree.
    with open(os.path.join(directory_path, fname), 'wb') as local_file:
        session.retrbinary(f"RETR {fname}", local_file.write)

NameError: name 'FTP' is not defined

## Process Data

### Stations Structure:
| Column name | Character positions (1-based, inclusive) | Data type |
| --- | --- | --- |
| ID | 1-11 | Character |
| LATITUDE | 13-20 | Real |
| LONGITUDE | 22-30 | Real |
| ELEVATION | 32-37 | Real |
| STATE | 39-40 | Character |
| NAME | 42-71 | Character |
| GSN FLAG | 73-75 | Character |
| HCN/CRN FLAG | 77-79 | Character |
| WMO ID | 81-85 | Character |


In [5]:
# Location of the downloaded station list.
file_path = './data/ghcnd-stations.txt'


def conv_str(x) -> str:
    """Return the field value ``x`` converted with ``str``."""
    as_text = str(x)
    return as_text


def conv_float(x) -> float:
    """Return the field value ``x`` converted with ``float``.

    Raises:
        ValueError: if ``x`` is a string that ``float`` cannot parse.
    """
    as_number = float(x)
    return as_number

# Fixed-width layout of ghcnd-stations.txt as (field name, first column, last column),
# with columns counted from 1 and both ends included.
_station_layout = [
    ('ID', 1, 11),
    ('LATITUDE', 13, 20),
    ('LONGITUDE', 22, 30),
    ('ELEVATION', 32, 37),
    ('STATE', 39, 40),
    ('NAME', 42, 71),
    ('GSN FLAG', 73, 75),
    ('HCN/CRN FLAG', 77, 79),
    ('WMO ID', 81, 85),
]

# read_fwf takes zero-based, half-open (start, stop) pairs, hence `first - 1`.
column_specs = [(first - 1, last) for _, first, last in _station_layout]

# Field names in file order.
column_names = [name for name, _, _ in _station_layout]

# Latitude, longitude and elevation are parsed with conv_float; every other
# field goes through conv_str.
_float_fields = ('LATITUDE', 'LONGITUDE', 'ELEVATION')
col_conv = {
    name: (conv_float if name in _float_fields else conv_str)
    for name in column_names
}

# Parse the fixed-width station file into a DataFrame using the layout above.
df = pd.read_fwf(
    file_path,
    colspecs=column_specs,
    names=column_names,
    converters=col_conv,
)

# Map the upper-case source headers to the lower-case column names used for the database.
rename_columns = {
    'ID': 'code',
    'LATITUDE': 'lat',
    'LONGITUDE': 'lon',
    'ELEVATION': 'elevation',
    'STATE': 'state',
    'NAME': 'name',
    'GSN FLAG': 'flag1',
    'HCN/CRN FLAG': 'flag2',
    'WMO ID': 'wmo_id',
}

df = (
    df.rename(columns=rename_columns)
    # The station documentation lists WMO ID as a character field, so force text.
    .assign(wmo_id=lambda frame: frame['wmo_id'].astype(str))
    # Turn the literal strings 'None' and 'nan' (in any column) into NaN ...
    .replace(['None', 'nan'], np.nan)
    # ... and then blank out every missing value.
    .fillna('')
)

In [8]:
# Show the first 100 rows of df.
df.head(100)

Unnamed: 0,code,lat,lon,elevation,state,name,flag1,flag2,wmo_id
0,ACW00011604,17.1167,-61.7833,10.1,,ST JOHNS COOLIDGE FLD,,,
1,ACW00011647,17.1333,-61.7833,19.2,,ST JOHNS,,,
2,AE000041196,25.3330,55.5170,34.0,,SHARJAH INTER. AIRP,GSN,,41196
3,AEM00041194,25.2550,55.3640,10.4,,DUBAI INTL,,,41194
4,AEM00041217,24.4330,54.6510,26.8,,ABU DHABI INTL,,,41217
...,...,...,...,...,...,...,...,...,...
95,AGM00060686,21.3330,0.9500,399.0,,BORDJ-BADJ-MOKHTAR,,,60686
96,AGM00060690,19.5670,5.7670,401.0,,IN-GUEZZAM,,,60690
97,AJ000037575,41.5500,46.6670,490.0,,ZAKATALA,,,37575
98,AJ000037579,41.7000,46.8000,1746.0,,ALIBEK,,,37579


In [17]:
# Summary statistics of df.
# NOTE(review): the recorded output uses the column names Latitude/Longitude/Elevation,
# which differ from the lat/lon/elevation names assigned in the processing cell — it
# looks like output from an earlier run; re-run to refresh.
# NOTE(review): the recorded minimum elevation is -999.9, which looks like a
# missing-value placeholder rather than a real height — confirm against readme.txt.
df.describe()

Unnamed: 0,Latitude,Longitude,Elevation
count,125988.0,125988.0,125988.0
mean,25.741179,-44.342157,447.031776
std,28.323435,89.3985,656.340957
min,-90.0,-179.9947,-999.9
25%,20.919175,-100.54005,85.0
50%,36.91685,-84.7275,257.6
75%,42.82045,2.072075,596.8
max,83.65,179.7414,5033.0


In [18]:
# Print column dtypes and non-null counts of df.
# NOTE(review): the recorded output lists names such as Station_ID and Station_Name that
# do not match the names assigned in the processing cell — presumably stale; re-run to refresh.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125988 entries, 0 to 125987
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Station_ID    125988 non-null  object 
 1   Latitude      125988 non-null  float64
 2   Longitude     125988 non-null  float64
 3   Elevation     125988 non-null  float64
 4   State         125988 non-null  object 
 5   Station_Name  125988 non-null  object 
 6   GSN_Flag      125988 non-null  object 
 7   HCN_CRN_Flag  125988 non-null  object 
 8   WMO_ID        125988 non-null  object 
dtypes: float64(3), object(6)
memory usage: 8.7+ MB


## Export dataframe to CSV file

In [7]:
# Write df to data/export/modified_stations.csv.
directory_path = "data/export"
os.makedirs(directory_path, exist_ok=True)
# Derive the file path from directory_path so the folder that is created and
# the folder that is written to are always the same.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv(os.path.join(directory_path, "modified_stations.csv"), index=False)