In [3]:
import sys
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Suppress unnecessary warnings so that the presentation looks clean
import warnings
warnings.filterwarnings('ignore')

# scikit-learn imports
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier

# Notebook-wide display defaults: seaborn's 'whitegrid' style for plots, and no
# limit on the number of columns pandas shows when a frame is displayed.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None) # display all columns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Imputer

from scipy.stats import skew


In [2]:
# Both input files are tab-separated and ISO-8859-1 encoded, so the reader
# options are declared once and shared by the two read_csv calls.
_read_options = dict(sep="\t", encoding='iso-8859-1')
train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\AibHack\data\training.txt", **_read_options)
test = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\AibHack\data\testing.txt", **_read_options)

Unnamed: 0,BuildingID,County,Type,Year,RatingType,EnergyRatingCat,EnergyRatingCont,GroundFloorArea,AvgWallU,AvgRoofU,...,EffSecondHeatGenPlant,PercSecondHeat,FuelTypeThirdBoiler,EffThirdHeatGenPlant,PercThirdHeat,HeatingSystemSolarSpace,TotPrimaryEnergyFac,TotCO2Emissions,PrimaryEnergySecondarySpace,EnergyMainWater
0,190835,Co. Wexford,Mid-terrace house,Y1890.0,Existing,F,418.42,52.04 sq. m,2.1,2.3,...,,,,,,,,,0.0,2335.6809
1,523113,Co. Clare,Detached house,Y1901.0,Existing,G,520.35,147.44 sq. m,2.01,0.75,...,,,,,,,,,7144.838,4470.0759


In [4]:
# Confirm the load, then report the row count and the column index of each frame.
print('data loaded')
for frame, suffix in ((train, " rows for training set"), (test, " rows for test set")):
    print(str(len(frame)) + suffix)
for frame in (train, test):
    print(frame.columns)


data loaded
464514 rows for training set
159262 rows for test set
Index(['BuildingID', 'County', 'Type', 'Year', 'RatingType', 'EnergyRatingCat',
       'EnergyRatingCont', 'GroundFloorArea', 'AvgWallU', 'AvgRoofU',
       ...
       'EffSecondHeatGenPlant', 'PercSecondHeat', 'FuelTypeThirdBoiler',
       'EffThirdHeatGenPlant', 'PercThirdHeat', 'HeatingSystemSolarSpace',
       'TotPrimaryEnergyFac', 'TotCO2Emissions', 'PrimaryEnergySecondarySpace',
       'EnergyMainWater'],
      dtype='object', length=141)
Index(['BuildingID', 'County', 'Type', 'Year', 'RatingType', 'EnergyRatingCat',
       'EnergyRatingCont', 'GroundFloorArea', 'AvgWallU', 'AvgRoofU',
       ...
       'EffSecondHeatGenPlant', 'PercSecondHeat', 'FuelTypeThirdBoiler',
       'EffThirdHeatGenPlant', 'PercThirdHeat', 'HeatingSystemSolarSpace',
       'TotPrimaryEnergyFac', 'TotCO2Emissions', 'PrimaryEnergySecondarySpace',
       'EnergyMainWater'],
      dtype='object', length=141)


In [5]:
# Banner first, then the training frame's shape, column names and dtypes.
for banner_line in ("\n\n---------------------",
                    "TRAIN SET INFORMATION",
                    "---------------------"):
    print(banner_line)
column_headers = list(train.columns.values)
print("Shape of data set:", train.shape, "\n")
print("Column Headers:", column_headers, "\n")
print(train.dtypes)



---------------------
TRAIN SET INFORMATION
---------------------
Shape of data set: (464514, 141) 

Column Headers: ['BuildingID', 'County', 'Type', 'Year', 'RatingType', 'EnergyRatingCat', 'EnergyRatingCont', 'GroundFloorArea', 'AvgWallU', 'AvgRoofU', 'AvgFloorU', 'AvgWindowU', 'AvgDoorU', 'ExposedWallArea', 'ExposedRoofArea', 'TotFloorArea', 'TotWindowArea', 'TotDoorArea', 'NStoreys', 'MainSHFuel', 'MainWHFuel', 'MainSHEfficiency', 'SharedMPRN', 'BuildingReg', 'MainSHAdj', 'SecSHFrac', 'SecSHEfficiency', 'MainWHEfficiency', 'MainWHAdj', 'SecSHFuel', 'SecWHFuel', 'NChimneys', 'NOpenFlues', 'NFansVents', 'DLobby', 'VentMethod', 'WallStructure', 'SuspWoodFloor', 'PercDraughtStripped', 'NShelteredSides', 'PermTest', 'PermTestResult', 'TempAdj', 'ControlCat', 'ResponseCat', 'NCHPumps', 'BoilerStat1', 'BoilerStat2', 'OilPumpIn', 'NGasFans', 'WarmAir', 'Underfloor', 'DistLoss', 'StorageLoss', 'DeclaredLoss', 'SolarWaterHeating', 'SummerElectricImm', 'Combi', 'KeepHot', 'VolWaterStorage',

In [6]:
import re  # retained: other cells may rely on this import being executed here
missing_values = []
nonumeric_values = []

print ("TRAIN SET INFORMATION")
print ("========================\n")

# Walk every training column once and record three things about it:
#   * a preview of its unique values (at most 17 are printed),
#   * whether it contains missing values (summary string -> missing_values),
#   * whether any non-missing value is not a number (column name -> nonumeric_values).
for column in train:
    # Find all the unique feature values
    uniq = train[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 17):
        print("~~Listing up to 17 unique values~~")
    print (uniq[0:17])
    print ("\n-----------------------------------------------------------------------\n")

    # Find features with missing values
    n_missing = pd.isnull(train[column]).sum()
    if n_missing > 0:
        s = "{} has {} missing" .format(column, n_missing)
        missing_values.append(s)

    # Find features with non-numeric values.
    # Every non-missing unique value is checked (the previous scan skipped the
    # first value and stopped at the first NaN). pd.to_numeric decides what is
    # a number, so negatives and scientific notation count as numeric; anything
    # it cannot parse is coerced to NaN and marks the column as non-numeric.
    observed = pd.Series(uniq).dropna()
    if pd.to_numeric(observed, errors='coerce').isnull().any():
        nonumeric_values.append(column)

print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TRAIN SET INFORMATION

'BuildingID' has 464514 unique values
~~Listing up to 17 unique values~~
[190835 523113 271393 585961 624083  30231 350444 592196 365924 303002
 634938 300824 449623 379988  68299 597104 163298]

-----------------------------------------------------------------------

'County' has 56 unique values
~~Listing up to 17 unique values~~
['Co. Wexford' 'Co. Clare' 'Co. Kerry' 'Co. Cork' 'Co. Laois' 'Cork City'
 'Co. Carlow' 'Co. Offaly' 'Co. Donegal' 'Co. Meath' 'Co. Cavan' 'Dublin 4'
 'Co. Galway' 'Dublin 14' 'Dublin 6' 'Co. Roscommon' 'Dublin 17']

-----------------------------------------------------------------------

'Type' has 12 unique values
['Mid-terrace house' 'Detached house' 'Semi-detached house'
 'Ground-floor apartment' 'Mid-floor apartment' 'End of terrace house'
 'Top-floor apartment' 'House' 'Maisonette' 'Apartment' 'Basement Dwelling'
 nan]

-----------------------------------------------------------------------

'Year' has 249 unique values
~~Listing

In [13]:
def is_outlier(points, thresh = 3.5):
    """Flag outliers using a modified z-score based on the median absolute deviation.

    Parameters
    ----------
    points : array-like
        Either a 1-D collection of n values or an (n, d) collection of n
        observations. Lists, numpy arrays and pandas Series are accepted; the
        input is converted to a float ndarray and is never modified.
    thresh : float, default 3.5
        Observations whose modified z-score exceeds this value are flagged.

    Returns
    -------
    numpy.ndarray of bool, shape (n,)
        True where the observation is classified as an outlier.

    Notes
    -----
    The score is ``0.6745 * distance_from_median / MAD``. When the MAD is zero
    or undefined (more than half of the observations sit exactly on the median,
    or the data contain NaN) the score cannot be computed, so no observation is
    flagged instead of dividing by zero.
    """
    # np.asarray makes Series / list input safe: `series[:, None]` is not
    # supported by recent pandas releases, while ndarray indexing always is.
    points = np.asarray(points, dtype=float)
    if points.ndim == 1:
        points = points[:, None]

    # Nothing to score; avoids taking the median of an empty array.
    if points.shape[0] == 0:
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of every observation from the per-dimension median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # `not (x > 0)` is also True for NaN, so both degenerate cases land here.
    if not med_abs_deviation > 0:
        return np.zeros(diff.shape, dtype=bool)

    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh

In [8]:
 # Stack the training rows on top of the test rows under a fresh 0..n-1 index so
 # the cleaning cells below operate on both sets at once. pd.concat replaces
 # DataFrame.append, which was removed in pandas 2.0.
 df = pd.concat([train, test], ignore_index = True)

In [10]:
# Names of the columns stored with dtype 'object', in their original order.
cats = [name for name in df.columns.values if df[name].dtype == 'object']

In [11]:
# Split the combined frame by dtype: everything except the object columns ...
df_cont = df.drop(cats, axis=1)
# ... and the object columns themselves. The explicit copy makes df_cat an
# independent frame: later cells overwrite and add columns on it, which pandas
# would otherwise treat as writes to a slice of df (SettingWithCopyWarning).
df_cat = df[cats].copy()

In [14]:
# Clean the numeric columns of the combined frame:
#   * more than 50 missing values  -> the column is dropped;
#   * 1 to 50 missing values       -> median imputation, outliers replaced by the
#                                     median, log transform when right-skewed,
#                                     then L2 normalisation of the whole column.
# NOTE(review): columns without any missing value skip the outlier / skew /
# normalisation steps entirely, exactly as in the original cell -- confirm that
# this is intended and not an indentation slip.
for col in df_cont.columns.values:
    n_missing = df_cont[col].isnull().sum()
    if n_missing > 50:
        #print("Removing Column: {}".format(col))
        df_cont = df_cont.drop(col, axis = 1)
    elif n_missing > 0:
        #print("Replacing with Median: {}".format(col))
        median = df_cont[col].median()

        # Work on a private float array and assign the column back once at the
        # end; chained `df[col].iloc[...] = value` writes are not guaranteed to
        # reach the frame.
        values = df_cont[col].fillna(median).values.astype(float)

        # A plain ndarray is passed so is_outlier never has to index a Series.
        values[is_outlier(values)] = median

        if skew(values) > 0.75:
            #print("Skewness Detected: {}".format(col))
            values = np.log(values)
            # log(0) is -inf; map it back to 0 as before.
            # NOTE(review): negative inputs become NaN here and would make
            # Normalizer raise -- presumably these columns are non-negative.
            values[values == -np.inf] = 0

        # Normalizer scales each *row*, so the column is presented as a single
        # row vector and rescaled to unit L2 norm as a whole.
        df_cont[col] = Normalizer().fit_transform(values.reshape(1, -1))[0]

In [None]:
# Replace every categorical column by 0/1 indicator columns named '<col>_<code>':
#   * more than 50 missing values -> the column is dropped;
#   * 1 to 50 missing values      -> missing entries become the category 'MIA';
#   * the values are label-encoded to integer codes 0..k-1 and one indicator is
#     emitted per code in range(k-1), i.e. the highest code gets no column of
#     its own (it is the row where all indicators are 0).
# Indicators are built with one vectorised comparison per code and joined in a
# single concat, rather than a Python-level apply per row and a column insert
# per category.
encoded_frames = []
for col in df_cat.columns.values:
    column = df_cat[col]
    n_missing = column.isnull().sum()
    if n_missing > 50:
        continue
    if n_missing > 0:
        column = column.fillna('MIA')

    codes = LabelEncoder().fit_transform(column)

    num_cols = codes.max()
    encoded_frames.append(pd.DataFrame(
        {col + '_' + str(i): (codes == i).astype('int64') for i in range(num_cols)},
        index=df_cat.index,
    ))

# No original column survives: each one was either dropped or replaced.
if encoded_frames:
    df_cat = pd.concat(encoded_frames, axis=1)
else:
    df_cat = pd.DataFrame(index=df_cat.index)

#### 2. Summarize data
Descriptive statistics

In [1]:
# Brief look at the summary statistics of all numerical columns.
numeric_summary = train.describe(include=['number'])
numeric_summary

NameError: name 'train' is not defined

In [None]:
def f(x):
    """Return `x` without its trailing area unit, e.g. '52.04 sq. m' -> '52.04'.

    Surrounding whitespace is removed, then any trailing run of the characters
    's', 'q', '.', ' ' and 'm' is dropped. Only the right-hand side is trimmed,
    so a leading decimal point survives ('.5 sq. m' -> '.5'). Values that are
    not strings (for example NaN for a missing entry) are returned unchanged
    instead of raising.
    """
    if not isinstance(x, str):
        return x
    return x.strip().rstrip('sq. m')

# The column still holds strings afterwards; no numeric conversion happens here.
train['GroundFloorArea'] = train['GroundFloorArea'].map(f)