In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the test-set feature table from the working directory, then print
# its column names, non-null counts and dtypes.
test_data = pd.read_csv(filepath_or_buffer="test-set-values.csv")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14850 entries, 0 to 14849
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     14850 non-null  int64  
 1   amount_tsh             14850 non-null  float64
 2   date_recorded          14850 non-null  object 
 3   funder                 13981 non-null  object 
 4   gps_height             14850 non-null  int64  
 5   installer              13973 non-null  object 
 6   longitude              14850 non-null  float64
 7   latitude               14850 non-null  float64
 8   wpt_name               14850 non-null  object 
 9   num_private            14850 non-null  int64  
 10  basin                  14850 non-null  object 
 11  subvillage             14751 non-null  object 
 12  region                 14850 non-null  object 
 13  region_code            14850 non-null  int64  
 14  district_code          14850 non-null  int64  
 15  lg

In [5]:
def wrangle_data(df):
    
    # Drop non-relevant columns
    # Features here either share high covariance with other features 
    # Or are not highly correlated with target variables
    df = df.drop(['wpt_name',"scheme_name","quantity_group",
                  "scheme_management","management",
                  "source_type",'source', 'waterpoint_type_group', 
                  'district_code','public_meeting',
                    'recorded_by','subvillage', 'extraction_type_group',
                    'extraction_type', 'payment_type', 'management_group',
                   'quality_group',"amount_tsh"], axis=1)
    
    # region_code --> categorical
    df['region_code'] = df['region_code'].astype(str)
    
    # date_recorded --> split by year, month, and day
    df[['year', 'month', 'day']] = df['date_recorded'].str.split(pat='-', n=-1, expand=True)
    
    # drop date_recorded
    df = df.drop('date_recorded', axis=1)
    
    # Transform year, month, and day into integers
    df[['year', 'month', 'day']] = df[['year', 'month', 'day']].astype(int)
    
    # Change all construction years with zero values to year of record
    df['construction_year'].loc[df['construction_year'] == 0] = df['year']
    
    # Split Data into seperate Numeric and Categorical dataframes
    def split(df):
        numerics = ['int64', 'float64']
        df_num = df.select_dtypes(include=numerics)
        df_cat = df.drop(df_num, axis='columns')
        print(df.shape, df_num.shape, df_cat.shape)
        return df_num, df_cat
    
    df_num, df_cat = split(df)
    
    #-------------- Feature Engineering ------------------#
    
    # Create 'age at inspection' feature subtracting the year inspected from the year constructed
    df_num['age_of_well'] = df_num['year'] - df_num['construction_year']
    
    # 
    mean_lat_train = df_num['latitude'].mean()
    mean_long_train = df_num['longitude'].mean()
    df_num['distance_x_height'] = np.sqrt((df_num['gps_height']**2 + df_num['longitude'] - mean_long_train)**2 + (df_num['latitude'] - mean_lat_train)**2)
    
    df_num = df_num.drop(['year', 'month', 'day'], axis=1)
    df_num = df_num.drop('construction_year', axis=1)
    
    return df_num, df_cat

In [6]:
# Clean the test features and unpack the numeric / categorical frames.
test_num, test_cat = wrangle_data(df=test_data)

ValueError: Columns must be same length as key