### TODO
- Re-run StationRecords with FULL dataset and check all cells
- Check why there are more records in the DB than in the data object

# StationDetails

## Objective

- Clean the StationDetails data
- Enrich the data by adding the weather station location details (e.g. city, state, country etc)
- Load the data into MySQL

## Load Data

In [815]:
# Load the raw weather-station metadata CSV into a DataFrame
# (`pd` is pandas; its import lives in the "Functions" cell of the StationRecords section)
StationDetails = pd.read_csv("/Users/todddequincey/globalwarming/data/StationDetails.csv")

### Append Geolocation Details

In [816]:
# Read in API key
# A context manager closes the file handle, and strip() removes the trailing
# newline so it is not embedded in the request URL built by getlocation.
with open("/Users/todddequincey/Desktop/api_key.txt", "r") as key_file:
    key = key_file.readline().strip()

In [817]:
# Import required libraries
from urllib.request import urlopen
import json

# Get the location of each weather station
def getlocation(lat, lon):
    '''
        Reverse-geocode a coordinate pair with the Google Maps Geocoding API.

        Reads the module-level `key` (the API key) when building the request URL.

        lat = Number or String. Latitude, interpolated into the `latlng` query parameter.
        lon = Number or String. Longitude, interpolated into the `latlng` query parameter.

        returns a dict that always has the keys street_number, street_name, locality,
        region1, region2, country, post_code, formatted_address, new_lat and new_lon.
        Values stay None when the request fails, the response cannot be parsed or the
        API returns no results, so every station still yields exactly one row and a
        positional join back onto the station dataframe stays aligned.
    '''
    # Local import: http.client is not imported anywhere else in this notebook
    from http.client import HTTPException

    # Google address component type -> key in the returned dict
    component_fields = {
        'street_number': 'street_number',
        'route': 'street_name',
        'locality': 'locality',
        'administrative_area_level_1': 'region1',
        'administrative_area_level_2': 'region2',
        'country': 'country',
        'postal_code': 'post_code',
    }

    # Start with every field set to None; fields are filled in as they are found
    results = dict.fromkeys((
        'street_number',
        'street_name',
        'locality',
        'region1',
        'region2',
        'country',
        'post_code',
        'formatted_address',
        'new_lat',
        'new_lon',
    ))

    # Built outside the try block so a missing `key` surfaces as an error
    # instead of silently producing an all-None row for every station
    url = "https://maps.googleapis.com/maps/api/geocode/json?"
    url += "latlng=%s,%s&sensor=false&key=%s" % (lat, lon, key)

    try:
        # The context manager closes the HTTP response once it has been read
        with urlopen(url) as response:
            j = json.loads(response.read())

        if j['status'] == 'ZERO_RESULTS' or not j['results']:
            # Nothing to parse: return the all-None row
            return results

        # Only the first result is used
        top_hit = j['results'][0]

        # Check if individual address elements exist and update the results
        for component in top_hit['address_components']:
            for component_type in component['types']:
                field = component_fields.get(component_type)
                if field is not None:
                    results[field] = component['long_name']

        # Capture formatted address
        results['formatted_address'] = top_hit['formatted_address']

        # Capture the lat and lon reported by the Google Maps API for the result
        location = top_hit['geometry']['location']
        results['new_lat'] = location['lat']
        results['new_lon'] = location['lng']

    # Network failures (OSError covers URLError/HTTPError/timeouts), HTTP protocol
    # errors, invalid JSON (ValueError) and unexpected payload shapes.
    # KeyboardInterrupt is deliberately NOT caught so a long run can be stopped.
    except (OSError, HTTPException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: keep whatever was filled in so far, leave the rest None
        pass

    return results

In [818]:
# Column names for dataframe
# NOTE(review): the last two names are 'lat'/'lon', but getlocation() returns the
# keys 'new_lat'/'new_lon'. If geo_info is built with the commented-out loop
# below, 'lat'/'lon' would presumably stay empty and the coordinates would land
# in extra 'new_lat'/'new_lon' columns, while the later keep_cols cell selects
# 'lat'/'lon' -- confirm against the saved geo_info data.
col_names = ['street_number',
             'street_name',
             'locality',
             'region1',
             'region2',
             'country',
             'post_code',
             'formatted_address',
             'lat',
             'lon']

# Create an empty dataframe to store the results
#geo_info = pd.DataFrame(columns=col_names)

# Get geoinfo for each weather station and append to geo_info
# (disabled: makes one getlocation() call per station row)
#for i in range(0,len(StationDetails)):
#    results = getlocation(StationDetails.loc[i, 'Lat'], StationDetails.loc[i, 'Lon'])
#    geo_info = geo_info.append(results, ignore_index=True)

In [819]:
# Save geoinfo results to feather for future use
#feather.write_dataframe(geo_info, "/Users/todddequincey/globalwarming/data/geo_info.feather")

In [820]:
# Left join the Google API weather station data to the original dataset
# DataFrame.join matches rows on the index here (no `on=` is given).
# NOTE(review): no active cell in this notebook defines geo_info (the code that
# builds and saves it is commented out) -- presumably it is loaded from the
# saved geo_info.feather; confirm before re-running.
StationDetails = StationDetails.join(geo_info, how='left')

In [821]:
# Fill missing values with np.nan
# Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0 (`np` is imported in the "Functions" cell of this notebook).
StationDetails = StationDetails.fillna(value=np.nan)

In [822]:
# Rows for weather stations whose locality (city) or country is missing
null_records = StationDetails[StationDetails[['locality', 'country']].isnull().any(axis=1)]


In [823]:
# How many distinct weather stations have missing location data
null_records['StationId'].nunique(dropna=False)

11433

In [824]:
# Per country code, count the stations with missing location data and keep
# only the countries with more than 100 of them
results = null_records.groupby('CountryId')['StationId'].count()
results = results[results > 100]
results

CountryId
AR     109
AS     362
AY     204
BR     515
CA     846
CH     398
CI     130
FI     231
FR     218
GL     103
IN     272
JA     182
KZ     139
RS    1212
SF     148
SW     230
SZ     132
UK     548
US    1627
Name: StationId, dtype: int64

In [825]:
# Unique ids, cast to int, of every station with missing location info
# NOTE: To be removed from the weather station records if they exist
stn_no_loc = null_records['StationId'].unique().astype(int)

In [826]:
# Keep only the weather stations that have both a locality (city) and a country
# Note: Should have done this BEFORE collecting the location data from the Google API
StationDetails = StationDetails.dropna(subset=['locality', 'country'])

In [827]:
# Reset the index of the df
# reset_index() without drop=True also keeps the old index as a new 'index'
# column; the keep_cols selection later in this notebook does not retain it.
StationDetails = StationDetails.reset_index()
#StationDetails = StationDetails.reindex()

In [828]:
# Keep only the columns needed downstream, in this order
keep_cols = [
    'StationId',
    'StationName',
    'Elevation',
    'StartDate',
    'EndDate',
    'locality',
    'region1',
    'country',
    'lat',
    'lon',
]
StationDetails = StationDetails[keep_cols]

In [829]:
# Assign the final column names (positional: one name per existing column)
StationDetails.columns = [
    'StationId',
    'StationName',
    'Elevation',
    'StartDate',
    'EndDate',
    'City',
    'State',
    'Country',
    'Lat',
    'Lon',
]

In [830]:
# Save the cleaned StationDetails dataframe to feather format
# (`feather` is imported in the "Functions" cell of this notebook)
feather.write_dataframe(StationDetails, "/Users/todddequincey/globalwarming/data/StationDetails.feather")

# StationRecords

## Objective

- Combine individual weather station files into single annual files
- Consolidate the daily weather recordings into more meaningful monthly averages
- Complete any required data cleaning 
- Load the data into MySQL

As the size of the monthly summarised data is expected to fit in memory, the data will be saved into a single dataframe before being imported into MySQL.

If this is not the case, each year of data will be loaded into MySQL separately. 

## Load Data

### Functions

In [751]:
# Import required libraries
import csv
import os
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import pymysql
import feather

In [779]:
# Combine all CSV files of one year into a single dataframe
def create_annual_file(folder_path, year):
    '''
        For all of the CSV files in a given year's folder, combine all of the CSVs
        into a single annual dataframe.

        Only files whose name ends in ".csv" (case-insensitive) are read, so stray
        directory entries such as macOS ".DS_Store" files are ignored. Files are
        read in sorted name order so the row order is reproducible.

        folder_path = String. The folder path where each of the annual folders are saved
                      (with or without a trailing path separator).
        year = String. The calendar year file being created (name of the sub-folder).

        returns a DataFrame with the rows of every CSV file; STATION is read as int.
        raises ValueError if the folder contains no CSV files.
    '''
    # Full path name (os.path.join copes with a missing trailing separator)
    full_path = os.path.join(folder_path, year)

    # CSV file names in the dir, in a deterministic order
    file_names = sorted(f for f in os.listdir(full_path) if f.lower().endswith(".csv"))

    # pd.concat would fail with a generic message on an empty list; be explicit
    if not file_names:
        raise ValueError("No CSV files found in {}".format(full_path))

    # Combine the CSVs
    combined_file = pd.concat(
        [pd.read_csv(os.path.join(full_path, f), dtype={'STATION': int}) for f in file_names]
    )

    # Return combined_csv df
    return combined_file

In [780]:
# Clean the CSV
def clean_files(df):
    '''
        Clean an annual GSOD dataframe: keep the measurement columns, rename them
        and turn the missing-value codes into NaN.

        Columns are selected and renamed BY NAME, so the result does not depend on
        the column order of the input files. Missing-value codes are applied per
        column (codes taken from the NOAA GSOD format documentation): 9999.9 for
        temperatures, dew point and pressures; 999.9 for visibility, wind, gust
        and snow depth; 99.99 for precipitation. A value such as 999.9 in a
        pressure column is therefore kept as a real reading.

        df = dataframe returned from the create_annual_file function. It is not modified.

        returns a new DataFrame with the columns StationId, Date, Temp, Dew, SLP,
        StationPressure, Visib, WindSpeed, MaxWindSpeed, Gust, MaxTemp, MinTemp,
        Precip, SnowDepth.
        raises KeyError if one of the required source columns is missing.
    '''
    # Source column -> cleaned column name (dict order fixes the output order)
    column_map = {
        'STATION': 'StationId',
        'DATE': 'Date',
        'TEMP': 'Temp',
        'DEWP': 'Dew',
        'SLP': 'SLP',
        'STP': 'StationPressure',
        'VISIB': 'Visib',
        'WDSP': 'WindSpeed',
        'MXSPD': 'MaxWindSpeed',
        'GUST': 'Gust',
        'MAX': 'MaxTemp',
        'MIN': 'MinTemp',
        'PRCP': 'Precip',
        'SNDP': 'SnowDepth',
    }

    # Cleaned column -> value that marks "missing" in that column
    missing_codes = {
        'Temp': 9999.9,
        'Dew': 9999.9,
        'SLP': 9999.9,
        'StationPressure': 9999.9,
        'Visib': 999.9,
        'WindSpeed': 999.9,
        'MaxWindSpeed': 999.9,
        'Gust': 999.9,
        'MaxTemp': 9999.9,
        'MinTemp': 9999.9,
        'Precip': 99.99,
        'SnowDepth': 999.9,
    }

    # Keep only the needed columns (location and *_ATTRIBUTES/FRSHTT columns are
    # dropped) and rename them; rename() returns a new frame
    cleaned = df.loc[:, list(column_map)].rename(columns=column_map)

    # Replace each column's missing-value code with NaN
    # (np.nan, not np.NaN: the latter alias was removed in NumPy 2.0)
    for column, code in missing_codes.items():
        cleaned[column] = cleaned[column].mask(cleaned[column] == code, np.nan)

    return cleaned

In [781]:
# Calculate monthly averages, min and max
def calc_monthly(df):
    '''
        Summarise cleaned daily records into one row per station, year and month.

        df = dataframe returned from the clean_files function. Must contain the
             columns StationId, Date, MaxTemp and MinTemp; every other column is
             averaged. The input dataframe is not modified.

        returns a DataFrame with StationId, Year, Month, the monthly mean of each
        measurement column (the means of MaxTemp/MinTemp are named AvgMaxTemp/
        AvgMinTemp), then MaxTemp (highest daily max), MaxMinTemp (highest daily
        min), MinMaxTemp (lowest daily max), MinTemp (lowest daily min) and
        NumberDailyRecords (number of daily rows in the group).
    '''
    # Parse the dates once and derive the Year and Month columns on a copy,
    # so the caller's dataframe is left untouched
    dates = pd.DatetimeIndex(df['Date'])
    daily = df.drop(labels='Date', axis=1).assign(Year=dates.year, Month=dates.month)

    # Group the rows by StationId, year and month
    grouped = daily.groupby(by=['StationId', 'Year', 'Month'])

    # Monthly means of every measurement column
    monthly = grouped.mean().rename(columns={'MaxTemp': 'AvgMaxTemp',
                                             'MinTemp': 'AvgMinTemp'})

    # Monthly extremes of the daily max/min temperatures, renamed by column
    # name rather than by position
    temps = grouped[['MaxTemp', 'MinTemp']]
    highest = temps.max().rename(columns={'MinTemp': 'MaxMinTemp'})
    lowest = temps.min().rename(columns={'MaxTemp': 'MinMaxTemp'})

    # Number of daily rows that went into each monthly figure
    counts = grouped.size().rename('NumberDailyRecords')

    # Left join the pieces on the (StationId, Year, Month) index
    monthly = monthly.join(highest).join(lowest).join(counts)

    # Reset the index of the df so it is no longer grouped
    return monthly.reset_index()

In [782]:
# Run all functions
def run(folder_path, start_year, end_year):
    '''
        Converts individual weather station records into an annual file.
        Cleans the annual file.
        Calculates monthly summary level data.

        A year that fails at any step is reported with a printed message and
        skipped; the remaining years are still processed.

        folder_path = String. Path to where all of the folders for each year are saved.
        start_year = Int. Start year to combine data (inclusive)
        end_year = Int. End year to combine data (inclusive)

        returns pd.DataFrame with the monthly summaries of every successfully
        processed year (an empty DataFrame if no year succeeded)
    '''

    # Monthly summaries, one dataframe per successfully processed year.
    # Collected in a list and concatenated once at the end: DataFrame.append
    # was removed in pandas 2.0 and re-copied all rows on every call.
    monthly_frames = []

    for year in range(start_year, end_year + 1):
        try:
            # Create annual file
            print("Creating annual file for {}...".format(year))
            df = create_annual_file(folder_path, str(year))
            print("{} file created".format(year))

            # Clean annual file
            print("Cleaning {} file...".format(year))
            df = clean_files(df)
            print("{} file cleaned".format(year))

            # Convert the data into monthly summary
            print("Converting daily records in {} file into monthly summary...".format(year))
            df = calc_monthly(df)
            print("{} converted into monthly summary".format(year))

            # Keep the results for the final concatenation
            monthly_frames.append(df)
            print("-----------------------")

        # Catch all except clause for any errors: report the year and carry on
        except Exception as e:
            print("{} error with {}".format(e, year))
            print("-----------------------")

    # Combine all years and reset the index (pd.concat rejects an empty list)
    if monthly_frames:
        results = pd.concat(monthly_frames).reset_index(drop=True)
    else:
        results = pd.DataFrame()

    # Print success message
    print("Finished loading data.")

    return results

### Read and Save Data

#### Load Pre-Saved Data into Jupyter

In [756]:
# Load the previously saved monthly StationRecords dataframe from feather format
# (fastest read and write format); this is the same file that the save cell
# near the end of this notebook writes.
StationRecords = feather.read_dataframe("/Users/todddequincey/globalwarming/data/GSOD/1960-2019_monthly.feather")

#### Read in the raw data files

In [757]:
# Run all of the functions and save the df
# StationRecords = run("/Users/todddequincey/Downloads/GSOD/", 1960, 2019)

# Round the decimals to three places
#StationRecords = StationRecords.round(decimals=3)

In [783]:
# Run the whole pipeline for the single year 1970 and keep the result in `test`
test = run("/Users/todddequincey/Downloads/GSOD/", 1970, 1970)

Creating annual file for 1970...
1970 file created
Cleaning 1970 file...
1970 file cleaned
Converting daily records in 1970 file into monthly summary...
1970 converted into monthly summary
-----------------------
Finished loading data.


In [784]:
# Check that station id 72224603844 is present in the 1970 result
72224603844 in test.StationId.unique()
#7222463844 in test.StationId.unique()


True

# Unique Stations between Datasets

### Remove Redundant Records
Remove weather stations with no location information and/or set differences between StationDetails and StationRecords

#### Remove any weather station records where the weather station location information is missing

In [834]:
# How many StationRecords rows belong to stations without location info
int(StationRecords.StationId.isin(stn_no_loc).sum())


111147

In [835]:
# How many StationRecords rows will remain once those stations are removed
int((~StationRecords.StationId.isin(stn_no_loc)).sum())

215967

In [836]:
# Drop the records of every weather station that has no location information
StationRecords = StationRecords.loc[~StationRecords['StationId'].isin(stn_no_loc)]

#### Check for further weather stations in StationRecords not in StationDetails

In [861]:
# Station ids present in each dataset, as sets of ints
records = set(StationRecords['StationId'].unique().astype(int))
details = set(StationDetails['StationId'].unique().astype(int))

# Stations that have records but no row in StationDetails
missing_stations = records - details

In [862]:
# Number of stations that appear in StationRecords but not in StationDetails
len(missing_stations)

49

In [870]:
# Drop the records of the stations that have no row in StationDetails
StationRecords = StationRecords.loc[~StationRecords['StationId'].isin(missing_stations)]

In [871]:
# How many distinct weather stations are left in StationRecords
StationRecords['StationId'].nunique(dropna=False)

6389

#### Check for stations in StationDetails which are not in StationRecords

In [883]:
# Station ids present in each dataset, as sets of ints
records = set(StationRecords['StationId'].unique().astype(int))
details = set(StationDetails['StationId'].unique().astype(int))

# Stations that have a StationDetails row but no records
missing_stations = details - records

In [884]:
# Number of stations that appear in StationDetails but have no rows in StationRecords
len(missing_stations)

11913

In [886]:
# Drop the StationDetails rows of stations that have no records
StationDetails = StationDetails.loc[~StationDetails['StationId'].isin(missing_stations)]

#### Save the data to feather format

In [887]:
# Save both filtered dataframes to feather format (fastest to read and write for future use in Python)
# Both paths are also used earlier in this notebook (the StationRecords file is
# the one loaded under "Load Pre-Saved Data", the StationDetails file was
# written after cleaning), so existing files at these paths are written to again.
feather.write_dataframe(StationRecords, "/Users/todddequincey/globalwarming/data/GSOD/1960-2019_monthly.feather")
feather.write_dataframe(StationDetails, "/Users/todddequincey/globalwarming/data/StationDetails.feather")

## Load Data to MySQL

In [894]:
# Create the SQLAlchemy engine for the local `globalwarming` MySQL database (pymysql driver)
# NOTE(review): the URL hard-codes user `root` with an empty password --
# presumably a local development setup; confirm before running elsewhere.
engine = create_engine('mysql+pymysql://root:@localhost/globalwarming')

In [895]:
# Print progress message
print("Loading StationDetails data into MySQL...")

# Export data into MySQL (table `StationDetails`, dataframe index not written)
# NOTE(review): if_exists='append' inserts into an existing table rather than
# replacing it, so re-running this cell adds the same rows again -- possibly
# the cause of the row-count mismatch listed in the TODO at the top; confirm.
StationDetails.to_sql(con=engine,
            name='StationDetails', 
            if_exists = 'append', 
            index=False)

# Print success message
print("StationDetails data loaded to MySQL")

Loading StationDetails data into MySQL...
StationDetails data loaded to MySQL


In [896]:
# Print progress message
print("Loading StationRecords data into MySQL...")

# Export data into MySQL (table `StationRecords`, dataframe index not written)
# NOTE(review): if_exists='append' inserts into an existing table rather than
# replacing it, so re-running this cell adds the same rows again -- possibly
# the cause of the row-count mismatch listed in the TODO at the top; confirm.
StationRecords.to_sql(con=engine,
            name='StationRecords', 
            if_exists = 'append', 
            index=False)

# Print success message
print("StationRecords data loaded to MySQL")

Loading StationRecords data into MySQL...
StationRecords data loaded to MySQL
