In [1]:
import pandas as pd
import seaborn as sns
from datetime import datetime, date
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PolynomialFeatures 

In [2]:
# Column groupings used while exploring the raw data.
drop_columns = [
    'id', 'funder', 'num_private', 'longitude', 'latitude', 'wpt_name',
    'subvillage', 'region_code', 'lga', 'ward', 'recorded_by', 'scheme_name',
    'extraction_type_group', 'payment', 'quality_group', 'quantity_group',
    'source_type', 'source_class', 'waterpoint_type_group',
]
continuous_columns = [
    'amount_tsh', 'date_recorded', 'gps_height', 'population', 'construction_year',
]
categorical_columns = [
    'installer', 'basin', 'region', 'district_code', 'public_meeting',
    'scheme_management', 'permit', 'extraction_type', 'extraction_type_class',
    'management', 'management_group', 'payment_type', 'water_quality',
    'quantity', 'source', 'waterpoint_type',
]

# Feature values and labels live in separate CSV files.
df_values = pd.read_csv('Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv')
df_labels = pd.read_csv('Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv')

# Inner-join the two files on 'id', then discard the columns listed in drop_columns.
df_original = (
    df_values
    .merge(df_labels, on='id', how='inner')
    .drop(columns=drop_columns)
)

df_original.columns

Index(['amount_tsh', 'date_recorded', 'gps_height', 'installer', 'basin',
       'region', 'district_code', 'population', 'public_meeting',
       'scheme_management', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_class', 'management', 'management_group',
       'payment_type', 'water_quality', 'quantity', 'source',
       'waterpoint_type', 'status_group'],
      dtype='object')

In [5]:
def drop_useless_cols(df, drop_values = None):
    """Return ``df`` without the columns listed in ``drop_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from. It is never modified.
    drop_values : list of str, optional
        Column labels to drop. ``None`` (the default) drops nothing.

    Returns
    -------
    pandas.DataFrame or None
        * a new frame without the requested columns, when every label exists;
        * ``df`` itself, untouched, when at least one requested label is not
          a column of ``df`` (best effort: nothing is dropped in that case);
        * ``None``, after printing a message, when ``drop_values`` names one
          of the protected columns below.
    """
    # Later pipeline steps (fix_dates, bin_me) read these columns by name,
    # so the function refuses to drop them.
    protected_columns = ('amount_tsh', 'date_recorded', 'gps_height', 'population', 'construction_year')

    # A None default avoids sharing one mutable list between calls.
    if drop_values is None:
        drop_values = []

    for cont in protected_columns:
        if cont in drop_values:
            print(f'you cannot drop column: {cont}')
            return None

    try:
        return df.drop(drop_values, axis = 1)
    except KeyError:
        # DataFrame.drop raises KeyError for labels that are not present.
        return df
    
def fix_dates(df):
    """Replace 'date_recorded' with the age of each record in whole days.

    The age is computed as ``2020-01-01 - date_recorded`` and stored in a new
    'days_since_recording' column; the 'date_recorded' column is then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date_recorded' column of 'YYYY-MM-DD' strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. ``df`` is modified in place, so a
        second call on the same frame raises ``KeyError`` because
        'date_recorded' is no longer there.
    """
    basedate = pd.Timestamp(2020, 1, 1)
    # Parse the whole column at once instead of calling strptime per row.
    recorded = pd.to_datetime(df['date_recorded'], format="%Y-%m-%d")
    df['days_since_recording'] = (basedate - recorded).dt.days
    df.drop(['date_recorded'], axis = 1, inplace = True)
    return df

def clean_data(df, threshold = 100, min_count = 50):
    """Fill missing values and collapse rare categories into 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified (``fillna`` returns a new frame).
    threshold : int, default 100
        Only non-int/float columns with MORE than this many distinct values
        (counted after filling) are collapsed.
    min_count : int, default 50
        Within a collapsed column, values occurring fewer than ``min_count``
        times are replaced by the string 'other'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every missing value is the string 'Not Known'
        and rare values of high-cardinality columns are 'other'.

    Notes
    -----
    ``fillna`` is applied to the whole frame, so a numeric column that
    contains NaN would receive the string 'Not Known' as well.
    """
    # Replace NaN with the string 'Not Known'.
    df = df.fillna('Not Known')

    non_numeric = df.select_dtypes(exclude=['int','float'])
    high_cardinality = [
        col for col in non_numeric.columns
        if non_numeric[col].nunique(dropna=False) > threshold
    ]

    for col in high_cardinality:
        counts = df[col].value_counts()
        # A set gives constant-time membership tests inside map().
        frequent = set(counts[counts >= min_count].index)
        df[col] = df[col].map(lambda value: value if value in frequent else 'other')

    return df

def bin_me(df):
    """Replace 'construction_year' with a categorical 5-year bin column.

    A new 'construction_year_bin' column is added and 'construction_year' is
    dropped. ``df`` is modified in place and also returned.

    Bins are right-closed: (-1, 1955] is labelled 'not available' (this is
    where the placeholder year 0 lands), followed by (1955, 1960] -> '56-60'
    and so on up to (2010, 2015] -> '11-15'. Years above 2015 fall outside
    every bin and become NaN.

    Returns
    -------
    pandas.DataFrame or None
        * ``df`` with the binned column;
        * ``df`` unchanged (after printing 'action already performed') when it
          has already been binned, so re-running a cell is harmless;
        * ``None`` (after printing 'you messed up') when 'construction_year'
          is missing or cannot be binned.
    """
    if 'construction_year' not in df.columns:
        if 'construction_year_bin' in df.columns:
            print('action already performed')
            return df
        print('you messed up')
        return None

    cut_bins = [-1] + list(range(1955, 2016, 5))
    cut_labels = ['not available', '56-60','61-65','66-70','71-75','76-80','81-85','86-90','91-95','96-00','01-05','06-10','11-15']

    try:
        binned = pd.cut(df['construction_year'], bins = cut_bins, labels = cut_labels)
    except (TypeError, ValueError):
        # e.g. the column holds strings and cannot be compared with the bin edges
        print('you messed up')
        return None

    df['construction_year_bin'] = binned
    df.drop(['construction_year'], axis = 1, inplace = True)
    return df

def onehotmess(df):
    """Split ``df`` into a model-ready feature frame and the target column.

    Int/float columns are kept as they are; every other column except
    'status_group' is expanded with ``pd.get_dummies``.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` holds the numeric columns
        followed by the dummy columns, and ``target`` is ``df.status_group``.
    """
    non_numeric = df.select_dtypes(exclude=['int','float'])
    encoded = pd.get_dummies(non_numeric.drop(columns=['status_group']))
    numeric = df.select_dtypes(include=['int','float'])

    return pd.concat([numeric, encoded], axis = 1), df.status_group

def normalize_func(df_values, df_target, test_size = .05, random_state = 42):
    """Split features/target into train and test sets and min-max scale the features.

    Parameters
    ----------
    df_values : feature matrix passed to ``train_test_split``
    df_target : target values passed to ``train_test_split``
    test_size : float, default .05
        Forwarded to ``train_test_split``; the share of rows held out for testing.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train_, X_test_, y_train, y_test)``; the two X entries are the
        scaler's outputs, the two y entries are the unscaled target splits.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df_values, df_target, test_size = test_size, random_state = random_state
    )
    scaler = MinMaxScaler()

    # The scaler is fitted on the training rows only and then reused for the
    # test rows, so no information from the test set leaks into the scaling.
    X_train_ = scaler.fit_transform(X_train)
    X_test_ = scaler.transform(X_test)

    return X_train_, X_test_, y_train, y_test


def do_everything(string1, string2, drop_values, thresh = 200):
    """Run the whole preprocessing pipeline and return the scaled train/test split.

    Steps, in order: ``load_data`` -> ``drop_useless_cols`` -> ``fix_dates``
    -> ``clean_data`` -> ``bin_me`` -> ``onehotmess`` -> ``normalize_func``.

    Parameters
    ----------
    string1, string2 : forwarded to ``load_data``
    drop_values : forwarded to ``drop_useless_cols``
    thresh : int, default 200
        Forwarded to ``clean_data`` as its ``threshold``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by ``normalize_func``.
    """
    merged = load_data(string1, string2)
    trimmed = drop_useless_cols(merged, drop_values)
    with_age = fix_dates(trimmed)
    binned = bin_me(clean_data(with_age, thresh))
    features, target = onehotmess(binned)
    X_train, X_test, y_train, y_test = normalize_func(features, target)

    return (X_train, X_test, y_train, y_test)



In [None]:
def do_everything(string1, string2, drop_values, thresh = 200):
    """Load, clean, encode, split and scale the data in one call.

    ``string1`` and ``string2`` go to ``load_data``, ``drop_values`` goes to
    ``drop_useless_cols`` and ``thresh`` is the ``threshold`` handed to
    ``clean_data``. Returns ``(X_train, X_test, y_train, y_test)`` from
    ``normalize_func``.
    """
    raw = load_data(string1, string2)
    dated = fix_dates(drop_useless_cols(raw, drop_values))
    binned = bin_me(clean_data(dated, thresh))
    features, labels = onehotmess(binned)

    train_X, test_X, train_y, test_y = normalize_func(features, labels)
    return train_X, test_X, train_y, test_y

In [26]:
# Inputs for the one-call pipeline: the two training CSVs ...
string_1 = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv'
string_2 = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv'

# ... and the columns to discard before any feature engineering.
drop_columns = [
    'id', 'funder', 'num_private', 'longitude', 'latitude', 'wpt_name',
    'subvillage', 'region_code', 'lga', 'ward', 'recorded_by', 'scheme_name',
    'extraction_type_group', 'payment', 'quality_group', 'quantity_group',
    'source_type', 'source_class', 'waterpoint_type_group',
]

do_everything(string1=string_1, string2=string_2, drop_values=drop_columns)

(array([[0.        , 0.03146853, 0.375     , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.03146853, 0.375     , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.50244755, 0.0875    , ..., 0.        , 0.        ,
         1.        ],
        ...,
        [0.        , 0.02692308, 0.0625    , ..., 1.        , 0.        ,
         0.        ],
        [0.        , 0.03146853, 0.075     , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.48251748, 0.0125    , ..., 0.        , 0.        ,
         0.        ]]),
 array([[0.00000000e+00, 3.14685315e-02, 6.25000000e-02, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 3.14685315e-02, 7.50000000e-02, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [2.85714286e-05, 6.17132867e-01, 1.25000000e-02, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        ...,
        [0.00000000e+00, 3.14685315e

In [4]:
# Paths of the two training CSVs (feature values and labels) handed to load_data below.
string_1 = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv'
string_2 = 'Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv'

def load_data(string1, string2):
    """Read two CSV sources and return their inner join on the 'id' column.

    Parameters
    ----------
    string1, string2 : anything ``pd.read_csv`` accepts (typically file paths)
        Both sources must contain an 'id' column.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'id' appears in both sources, with the columns of the first
        source followed by those of the second.
    """
    left = pd.read_csv(string1)
    right = pd.read_csv(string2)
    return left.merge(right, on = 'id', how = 'inner')

# Read both CSVs and inner-join them on 'id' (see load_data above).
loaded_data = load_data(string_1, string_2)

In [6]:
def fix_dates(df):
    """Replace 'date_recorded' with the age of each record in whole days.

    The age is computed as ``2020-01-01 - date_recorded`` and stored in a new
    'days_since_recording' column; the 'date_recorded' column is then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date_recorded' column of 'YYYY-MM-DD' strings.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. ``df`` is modified in place, so a
        second call on the same frame raises ``KeyError`` because
        'date_recorded' is no longer there.
    """
    basedate = pd.Timestamp(2020, 1, 1)
    # Parse the whole column at once instead of calling strptime per row.
    recorded = pd.to_datetime(df['date_recorded'], format="%Y-%m-%d")
    df['days_since_recording'] = (basedate - recorded).dt.days
    df.drop(['date_recorded'], axis = 1, inplace = True)
    return df

# No other top-level cell creates df_dropped, so build it here from the merged
# frame; otherwise this cell raises NameError on a fresh kernel. Rebuilding it
# on every run also keeps the cell re-runnable, because fix_dates modifies the
# frame it is given.
df_dropped = drop_useless_cols(loaded_data, drop_columns)
fixed_date = fix_dates(df_dropped)

In [7]:
def clean_data(df, threshold = 100, min_count = 50):
    """Fill missing values and collapse rare categories into 'other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified (``fillna`` returns a new frame).
    threshold : int, default 100
        Only non-int/float columns with MORE than this many distinct values
        (counted after filling) are collapsed.
    min_count : int, default 50
        Within a collapsed column, values occurring fewer than ``min_count``
        times are replaced by the string 'other'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every missing value is the string 'Not Known'
        and rare values of high-cardinality columns are 'other'.

    Notes
    -----
    ``fillna`` is applied to the whole frame, so a numeric column that
    contains NaN would receive the string 'Not Known' as well.
    """
    # Replace NaN with the string 'Not Known'.
    df = df.fillna('Not Known')

    non_numeric = df.select_dtypes(exclude=['int','float'])
    high_cardinality = [
        col for col in non_numeric.columns
        if non_numeric[col].nunique(dropna=False) > threshold
    ]

    for col in high_cardinality:
        counts = df[col].value_counts()
        # A set gives constant-time membership tests inside map().
        frequent = set(counts[counts >= min_count].index)
        df[col] = df[col].map(lambda value: value if value in frequent else 'other')

    return df

    
# Uses clean_data's default threshold of 100; do_everything passes 200 instead.
cleaner_df = clean_data(fixed_date)
    
    

In [8]:
# Sanity check: after cleaning, rare installers should be grouped under 'other'.
cleaner_df['installer'].value_counts()

DWE           17402
other          9303
Not Known      3655
Government     1825
RWE            1206
              ...  
RC Ch            52
wanan            52
Local te         52
RWE/DWE          52
CDTF             50
Name: installer, Length: 146, dtype: int64

In [9]:
def bin_me(df):
    """Replace 'construction_year' with a categorical 5-year bin column.

    A new 'construction_year_bin' column is added and 'construction_year' is
    dropped. ``df`` is modified in place and also returned.

    Bins are right-closed: (-1, 1955] is labelled 'not available' (this is
    where the placeholder year 0 lands), followed by (1955, 1960] -> '56-60'
    and so on up to (2010, 2015] -> '11-15'. Years above 2015 fall outside
    every bin and become NaN.

    Returns
    -------
    pandas.DataFrame or None
        * ``df`` with the binned column;
        * ``df`` unchanged (after printing 'action already performed') when it
          has already been binned, so re-running a cell is harmless;
        * ``None`` (after printing 'you messed up') when 'construction_year'
          is missing or cannot be binned.
    """
    if 'construction_year' not in df.columns:
        if 'construction_year_bin' in df.columns:
            print('action already performed')
            return df
        print('you messed up')
        return None

    cut_bins = [-1] + list(range(1955, 2016, 5))
    cut_labels = ['not available', '56-60','61-65','66-70','71-75','76-80','81-85','86-90','91-95','96-00','01-05','06-10','11-15']

    try:
        binned = pd.cut(df['construction_year'], bins = cut_bins, labels = cut_labels)
    except (TypeError, ValueError):
        # e.g. the column holds strings and cannot be compared with the bin edges
        print('you messed up')
        return None

    df['construction_year_bin'] = binned
    df.drop(['construction_year'], axis = 1, inplace = True)
    return df
            
# bin_me adds 'construction_year_bin' to cleaner_df in place and drops
# 'construction_year', so df_binned and cleaner_df are the same mutated frame.
df_binned = bin_me(cleaner_df)

In [17]:
def onehotmess(df):
    """Split ``df`` into a model-ready feature frame and the target column.

    Int/float columns are kept as they are; every other column except
    'status_group' is expanded with ``pd.get_dummies``.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` holds the numeric columns
        followed by the dummy columns, and ``target`` is ``df.status_group``.
    """
    non_numeric = df.select_dtypes(exclude=['int','float'])
    encoded = pd.get_dummies(non_numeric.drop(columns=['status_group']))
    numeric = df.select_dtypes(include=['int','float'])

    return pd.concat([numeric, encoded], axis = 1), df.status_group

# ohm_df: numeric columns plus one-hot dummies; target_df: the 'status_group' column.
ohm_df, target_df = onehotmess(df_binned)

In [24]:
def normalize_func(df_values, df_target, test_size = .05, random_state = 42):
    """Split features/target into train and test sets and min-max scale the features.

    Parameters
    ----------
    df_values : feature matrix passed to ``train_test_split``
    df_target : target values passed to ``train_test_split``
    test_size : float, default .05
        Forwarded to ``train_test_split``; the share of rows held out for testing.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train_, X_test_, y_train, y_test)``; the two X entries are the
        scaler's outputs, the two y entries are the unscaled target splits.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df_values, df_target, test_size = test_size, random_state = random_state
    )
    scaler = MinMaxScaler()

    # The scaler is fitted on the training rows only and then reused for the
    # test rows, so no information from the test set leaks into the scaling.
    X_train_ = scaler.fit_transform(X_train)
    X_test_ = scaler.transform(X_test)

    return X_train_, X_test_, y_train, y_test

# Hold out 5% of the rows for testing (random_state=42) and min-max scale the features.
X_train, X_test, y_train, y_test = normalize_func(ohm_df, target_df)