In [1]:
import pandas as pd
import numpy as np
import time
import requests

In [2]:
# Raw input CSVs and the destinations of their county-annotated copies.
raw_file_url = "../../data/raw/environment data.csv"
raw_dest = "../../data/raw/environment data - with counties.csv"

raw_grand_isle_url = "../../data/raw/grand isle.csv"
raw_grand_isle_dest = "../../data/raw/grand isle - with counties.csv"

# The cleaning stage reads exactly the files the raw stage writes, so these
# are aliases of the destinations above rather than repeated literals.
relevant_file_url = raw_dest
relevant_grand_isle_url = raw_grand_isle_dest

# Base path from which the cleaned output file name(s) are derived.
relevant_cleaned_dest = "../../data/cleaned/environment/cleaned environment data.csv"

In [3]:
def load_raw_data(url):
    """Read a raw CSV and prepare it for county annotation.

    Parameters
    ----------
    url : str or file-like
        Location of the CSV; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with surrounding whitespace removed from every
        column name and an all-``None`` ``COUNTY`` column placed at
        position 4.
    """
    frame = pd.read_csv(url)
    # Header cells may carry stray padding; normalise them before any
    # column is looked up by name.
    frame = frame.rename(columns=str.strip)
    # allow_duplicates=True keeps this from failing when the file already
    # contains a COUNTY column.
    frame.insert(loc=4, column="COUNTY", value=None, allow_duplicates=True)
    return frame

def convert_coords_to_county(data, dest, write=True, timeout=30):
    """Fill the ``COUNTY`` column by looking up each station's coordinates.

    One HTTP request per unique ``STATION`` value is sent to the FCC census
    endpoint, and the county name from the response is written to every row
    of that station. ``data`` is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs ``STATION`` and ``COUNTY`` columns. The coordinates are taken
        positionally from columns 2 and 3 of the station's first row
        (presumably LATITUDE and LONGITUDE -- TODO confirm against the raw
        file layout).
    dest : str
        CSV path the annotated table is written to when ``write`` is true.
    write : bool, default True
        Whether to save the annotated table to ``dest``.
    timeout : float, default 30
        Seconds to wait for each HTTP response. Without a timeout a single
        stalled connection would block the whole run indefinitely.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with ``COUNTY`` filled in.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or exceeds ``timeout``.
    KeyError
        If a response does not contain ``County`` / ``name``.
    """
    start = time.time()
    for station in data["STATION"].unique():
        # Build the row mask once; it is needed for both the read and the write.
        station_rows = data["STATION"] == station
        # Coordinates come from the first row recorded for this station.
        latitude, longitude = data[station_rows].values[0][2:4]
        payload = {'latitude': latitude, 'longitude': longitude, 'format': 'json'}
        response = requests.get(
            'https://geo.fcc.gov/api/census/area', params=payload, timeout=timeout
        )
        county = response.json()['County']['name']
        data.loc[station_rows, 'COUNTY'] = county
    print("County names generated in {} seconds.".format(time.time() - start))

    # Write to file
    if write:
        data.to_csv(dest, index=False)

    return data

def format_raw_data(url, dest, write=False):
    """Load a raw CSV and annotate it with county names in one step.

    ``url`` is read with ``load_raw_data``; the result is passed to
    ``convert_coords_to_county`` together with ``dest`` and ``write``, and
    whatever that call returns is returned unchanged.
    """
    return convert_coords_to_county(load_raw_data(url), dest, write)

In [4]:
# CLEANING THE DATA FURTHER

def load_relevant_data(url):
    """Load a county-annotated CSV and reduce it to the columns used for cleaning.

    Steps, in order: read the CSV; turn empty / whitespace-only cells into
    NaN; keep only the relevant columns; drop duplicate rows; sort by
    ``COUNTY`` then ``DATE`` and renumber the index; finally remove the
    descriptive columns (``NAME``, ``LATITUDE``, ``LONGITUDE``,
    ``ELEVATION``). Duplicates are judged while those descriptive columns are
    still present.

    Parameters
    ----------
    url : str or file-like
        Location of the CSV; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        ``COUNTY``, ``DATE`` and the measurement columns, sorted and with a
        fresh 0..n-1 index.
    """
    relevant_cols = [
        "NAME", "LATITUDE", "LONGITUDE", "COUNTY", "ELEVATION", "DATE",
        "CDSD", "EMXP", "PRCP", "CLDD",
        "DT00", "DT32", "DX32", "DX70", "DX90",
        "EMNT", "EMXT",
        "FZF0", "FZF1", "FZF2", "FZF3", "FZF4",
        "FZF5", "FZF6", "FZF7", "FZF8", "FZF9",
        "HTDD", "TAVG", "TMAX", "TMIN", "SNOW",
    ]
    # Needed only to tell rows apart while de-duplicating; not cleaned later.
    descriptive_cols = ["NAME", "LATITUDE", "LONGITUDE", "ELEVATION"]

    table = pd.read_csv(url)
    # Cells holding nothing but whitespace count as missing.
    table = table.replace(r'^\s*$', np.nan, regex=True)

    unique_rows = table[relevant_cols].drop_duplicates()
    ordered = unique_rows.sort_values(by=["COUNTY", "DATE"]).reset_index(drop=True)
    return ordered.drop(columns=descriptive_cols)

# CLEAN UP NULL VALUES IN DATA

def clean_null_by_county(data):
    """Fill missing measurements with the median of the same county and date.

    For every ``(COUNTY, DATE)`` group and every numeric column, NaN cells
    are replaced by the median of that column's non-missing values inside
    the group. A column that is entirely missing within a group has no
    median, so it stays NaN there. Non-numeric columns are left untouched;
    restricting the median to numeric columns matters because the string
    ``COUNTY`` column makes an unrestricted ``DataFrame.median`` raise on
    pandas >= 2.0.

    The medians are computed in a single grouped ``transform`` rather than
    by updating the full frame once per group.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``COUNTY`` and ``DATE`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the group-wise gaps filled.
    """
    df = data.copy()
    group_keys = ["COUNTY", "DATE"]

    # Only numeric measurement columns have a meaningful median.
    value_cols = [
        col for col in df.select_dtypes(include="number").columns
        if col not in group_keys
    ]
    if not value_cols:
        return df

    # Same shape and index as df: each cell holds the median of its column
    # within the row's (COUNTY, DATE) group, or NaN if the group has no data.
    group_medians = df.groupby(group_keys)[value_cols].transform("median")
    # fillna only touches cells that are NaN, so existing values are kept.
    df[value_cols] = df[value_cols].fillna(group_medians)

    return df

# CLEAN UP REMAINING NULL VALUES
def clean_remaining_nulls(df, sentinel=-9999999, max_missing_pct=50.0):
    """Produce two fully-populated variants of ``df``: clustering and predictions.

    * **clustering** -- every remaining NaN is replaced by ``sentinel``.
    * **predictions** -- columns whose share of missing values in ``df`` is
      at least ``max_missing_pct`` are dropped; the remaining numeric columns
      are linearly interpolated down the rows, any NaN still left is filled
      with the column median, and the rows are then collapsed to one per
      ``(COUNTY, DATE)`` using the median.

    All medians are restricted to numeric columns. Without that restriction
    the string ``COUNTY`` column makes ``median`` raise on pandas >= 2.0.
    Interpolation likewise only makes sense for numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``COUNTY`` and ``DATE`` columns. It is not modified.
    sentinel : scalar, default -9999999
        Placeholder written into missing cells of the clustering frame.
    max_missing_pct : float, default 50.0
        Columns with this percentage of missing values or more are removed
        from the predictions frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(clustering, predictions)``.

    Raises
    ------
    KeyError
        If ``COUNTY`` or ``DATE`` is absent or was dropped for exceeding the
        missing-value threshold.
    """
    # SPLIT THE DATA INTO TWO - ONE FOR PREDICTIONS, ONE FOR CLUSTERING
    clustering = df.fillna(sentinel)  # WIP

    # Share of missing cells per column, measured on the untouched input.
    # fillna(0.0) covers an empty frame, where the mean is undefined.
    missing_pct = df.isna().mean().fillna(0.0) * 100
    kept_cols = [col for col in df.columns if missing_pct[col] < max_missing_pct]
    predictions = df[kept_cols].copy()

    for col in predictions.select_dtypes(include="number").columns:
        predictions[col] = predictions[col].interpolate(method="linear")

    # Interpolation runs forward only, so leading gaps survive it; the column
    # median closes those before the per-group aggregation.
    predictions = predictions.fillna(predictions.median(numeric_only=True))
    predictions = (
        predictions.groupby(["COUNTY", "DATE"])
        .median(numeric_only=True)
        .reset_index()
    )

    return clustering, predictions

def clean_env_data(url, dest, fill=True, write=True):
    """Run the cleaning pipeline on a county-annotated CSV.

    The file at ``url`` is loaded with ``load_relevant_data`` and its gaps
    are filled per county/date with ``clean_null_by_county``. What happens
    next depends on ``fill``:

    * ``fill=True`` -- ``clean_remaining_nulls`` splits the result into a
      clustering and a predictions frame. When ``write`` is true they are
      saved next to ``dest`` with ``" - clustering"`` and ``" - predictions"``
      inserted before the ``.csv`` suffix, one file each.
    * ``fill=False`` -- the partially cleaned frame is kept as is and, when
      ``write`` is true, saved to ``dest``.

    The elapsed time is printed in both cases.

    Parameters
    ----------
    url : str
        Path of the county-annotated CSV to clean.
    dest : str
        Output path (``fill=False``) or base path for the two derived file
        names (``fill=True``).
    fill : bool, default True
        Whether to also eliminate the nulls left after the per-group fill.
    write : bool, default True
        Whether to write the result(s) to disk.

    Returns
    -------
    tuple of (DataFrame, DataFrame) or DataFrame
        ``(clustering, predictions)`` when ``fill`` is true, otherwise the
        single cleaned frame.
    """
    cleaning_time_start = time.time()

    cleaned_data = clean_null_by_county(load_relevant_data(url))

    if fill:
        cleaned_clustering, cleaned_predictions = clean_remaining_nulls(cleaned_data)
        if write:
            cleaned_clustering.to_csv(dest.replace(".csv", " - clustering.csv"), index=False)
            # Separate file name: reusing the clustering name here would
            # overwrite the file written on the previous line.
            cleaned_predictions.to_csv(dest.replace(".csv", " - predictions.csv"), index=False)
        result = (cleaned_clustering, cleaned_predictions)
    else:
        if write:
            cleaned_data.to_csv(dest, index=False)
        result = cleaned_data

    print("Data cleaned in {} seconds.".format(time.time() - cleaning_time_start))
    return result

def completely_clean(raw_url, raw_dest, relevant_url, relevant_dest, write_raw=False, fill_all_nulls=True, write_relevant=True):
    """Run both stages: county annotation of the raw file, then cleaning.

    Parameters
    ----------
    raw_url, raw_dest : str
        Input and output paths forwarded to ``format_raw_data``.
    relevant_url, relevant_dest : str
        Input and output paths forwarded to ``clean_env_data``.
    write_raw : bool, default False
        Forwarded as ``write`` to ``format_raw_data``.
    fill_all_nulls : bool, default True
        Forwarded as ``fill`` to ``clean_env_data``.
    write_relevant : bool, default True
        Forwarded as ``write`` to ``clean_env_data``.

    Returns
    -------
    tuple
        ``(formatted_data, fully_cleaned_data)`` -- the return values of the
        two stages, in that order.
    """
    formatted_data = format_raw_data(raw_url, raw_dest, write_raw)
    # Pass the fill_all_nulls parameter; the bare name `fill` is not defined
    # in this scope and would raise NameError.
    fully_cleaned_data = clean_env_data(relevant_url, relevant_dest, fill_all_nulls, write_relevant)

    return formatted_data, fully_cleaned_data

In [51]:
## Testing raw data generation
# Each call loads the raw CSV, requests a county name for every station over
# the network, and (third argument True) writes the annotated table to the
# given destination path.
raw_environment_data = format_raw_data(raw_file_url, raw_dest, True)
raw_grand_isle = format_raw_data(raw_grand_isle_url, raw_grand_isle_dest, True)

# Run once: combine both annotated tables and overwrite raw_dest with the result
# raw_combined = pd.concat([raw_environment_data, raw_grand_isle])
# raw_combined.to_csv(raw_dest, index=False)

# Testing loading relevant data (raw): load each annotated file separately and stack them
# formatted_grand_isle = load_relevant_data(relevant_grand_isle_url)
# formatted_all = load_relevant_data(relevant_file_url)
# merged = pd.concat([formatted_grand_isle, formatted_all])

County names generated in 26.778244733810425 seconds.
County names generated in 0.16954851150512695 seconds.


In [7]:
# Clean the county-annotated data, fill the remaining nulls, and write the
# output to disk under names derived from relevant_cleaned_dest.
clustering, predictions = clean_env_data(
    relevant_file_url,
    relevant_cleaned_dest,
    fill=True,
    write=True,
)

Data cleaned in 11.160754203796387 seconds.
