In [1]:
import pandas as pd

###  Reading_file   

In [2]:
dataframe = pd.read_csv('mtcars.csv')

### Statistics for columns     

In [31]:
dataframe.describe()   

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,am,gear,carb
count,32.0,32.0,31.0,32.0,31.0,31.0,31.0,31.0,30.0,31.0
mean,20.090625,6.1875,226.841935,146.6875,3.569677,3.210065,17.874516,0.419355,3.7,2.83871
std,6.026948,1.785922,123.996142,68.562868,0.521066,0.993773,1.810429,0.50161,0.749713,1.634967
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,3.0,1.0
25%,15.425,4.0,,96.5,,,,,,
50%,19.2,6.0,,123.0,,,,,,
75%,22.8,8.0,,180.0,,,,,,
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,5.0,8.0


### Summary of dataframe 

In [4]:
dataframe.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 12 columns):
Unnamed: 0    32 non-null object
mpg           32 non-null float64
cyl           32 non-null int64
disp          31 non-null float64
hp            32 non-null int64
drat          31 non-null float64
wt            31 non-null float64
qsec          31 non-null float64
vs            32 non-null object
am            31 non-null float64
gear          30 non-null float64
carb          31 non-null float64
dtypes: float64(8), int64(2), object(2)
memory usage: 3.1+ KB


### Column names   

In [5]:
dataframe.columns       

Index([u'Unnamed: 0', u'mpg', u'cyl', u'disp', u'hp', u'drat', u'wt', u'qsec',
       u'vs', u'am', u'gear', u'carb'],
      dtype='object')

### Changing Index   

In [6]:
dataframe.set_index('Unnamed: 0', inplace=True)
dataframe.index.names = ['']

In [7]:
dataframe.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
,,,,,,,,,,,
Mazda RX4,21.0,6.0,160.0,110.0,3.9,2.62,16.46,V-shaped,1.0,4.0,4.0
Mazda RX4 Wag,21.0,6.0,160.0,110.0,3.9,2.875,17.02,V-shaped,1.0,4.0,4.0
Datsun 710,22.8,4.0,108.0,93.0,3.85,2.32,18.61,straight,1.0,4.0,1.0
Hornet 4 Drive,21.4,6.0,258.0,110.0,3.08,3.215,19.44,straight,0.0,3.0,1.0
Hornet Sportabout,18.7,8.0,360.0,175.0,3.15,3.44,17.02,V-shaped,0.0,3.0,2.0


### Checking columns  datatypes  

In [8]:
# dataframe.info() works as well
for col in dataframe.columns:
    print col, dataframe[col].dtype

mpg float64
cyl int64
disp float64
hp int64
drat float64
wt float64
qsec float64
vs object
am float64
gear float64
carb float64


In [9]:
dataframe.mpg.dtype

dtype('float64')

### Changing the datatypes

In [10]:
dataframe.hp = dataframe.hp.astype('int') 

### Finding missing values 

In [11]:
dataframe.isnull().sum()

mpg     0
cyl     0
disp    1
hp      0
drat    1
wt      1
qsec    1
vs      0
am      1
gear    2
carb    1
dtype: int64

### Imputing Missing values  (Numeric) 

In [12]:
numerical_attributes = [col for col in dataframe if dataframe[col].dtype != 'object']

numerical_attributes

['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'am', 'gear', 'carb']

In [13]:
from sklearn.preprocessing import Imputer 

In [14]:
imputer = Imputer(missing_values='NaN', strategy = 'mean', axis= 0 )

In [15]:
imputed_dataframe_numeric =  pd.DataFrame(imputer.fit_transform(dataframe[numerical_attributes]), 
                                                                columns = numerical_attributes)

In [16]:
# Double check
imputed_dataframe_numeric.isnull().sum()

mpg     0
cyl     0
disp    0
hp      0
drat    0
wt      0
qsec    0
am      0
gear    0
carb    0
dtype: int64

In [17]:
# Copying the index
imputed_dataframe_numeric.index = dataframe.index

###  Imputing Missing values  (Categorical)   

In [18]:
# The only column in the dataframe that is a string is 'vs' but there are no missing values in it. So we skip this step
# imputed_dataframe_categorical = dataframe['vs'].apply(lambda x : x.fillna(x.value_counts().index[0])) 

### Categorical  to Numeric 

In [19]:
dataframe_dummies_vs = pd.get_dummies(dataframe['vs'], prefix='vs')

###  Scaling the Numerical Attribute (Range)     

In [20]:
# Concatenating the dataframes 
final_df = pd.concat([imputed_dataframe_numeric, dataframe_dummies_vs],axis = 1)

In [21]:
from sklearn.preprocessing import MinMaxScaler                          

In [22]:
minmax_scaler = MinMaxScaler() 

In [23]:
# Converts everything on scale of 0 to 1
minmax_scaler.fit_transform(final_df)  # returns an array

array([[ 0.45106383,  0.5       ,  0.22175106,  0.204947  ,  0.52534562,
         0.28304781,  0.23333333,  1.        ,  0.5       ,  0.42857143,
         1.        ,  0.        ],
       [ 0.45106383,  0.5       ,  0.22175106,  0.204947  ,  0.52534562,
         0.34824853,  0.3       ,  1.        ,  0.5       ,  0.42857143,
         1.        ,  0.        ],
       [ 0.52765957,  0.        ,  0.0920429 ,  0.14487633,  0.50230415,
         0.20634109,  0.48928571,  1.        ,  0.5       ,  0.        ,
         0.        ,  1.        ],
       [ 0.46808511,  0.5       ,  0.46620105,  0.204947  ,  0.14746544,
         0.43518282,  0.58809524,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [ 0.35319149,  1.        ,  0.72062859,  0.43462898,  0.1797235 ,
         0.49271286,  0.3       ,  0.        ,  0.        ,  0.14285714,
         1.        ,  0.        ],
       [ 0.32765957,  0.5       ,  0.38388626,  0.18727915,  0.        ,
         0.49782664,  

In [24]:
dataframe_generalized = pd.DataFrame(minmax_scaler.fit_transform(final_df), 
                                     columns= final_df.columns, index = final_df.index)

In [25]:
dataframe_generalized.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,am,gear,carb,vs_V-shaped,vs_straight
,,,,,,,,,,,,
Mazda RX4,0.451064,0.5,0.221751,0.204947,0.525346,0.283048,0.233333,1.0,0.5,0.428571,1.0,0.0
Mazda RX4 Wag,0.451064,0.5,0.221751,0.204947,0.525346,0.348249,0.3,1.0,0.5,0.428571,1.0,0.0
Datsun 710,0.52766,0.0,0.092043,0.144876,0.502304,0.206341,0.489286,1.0,0.5,0.0,0.0,1.0
Hornet 4 Drive,0.468085,0.5,0.466201,0.204947,0.147465,0.435183,0.588095,0.0,0.0,0.0,0.0,1.0
Hornet Sportabout,0.353191,1.0,0.720629,0.434629,0.179724,0.492713,0.3,0.0,0.0,0.142857,1.0,0.0


### Scaling the Numerical Attribute  (Standardize)  (Mean = 0, SD = 1) 

In [26]:
from sklearn.preprocessing import StandardScaler                         

In [27]:
std_scalar = StandardScaler()

In [28]:
# Converts all values to z-statistic with mean = 0 and unit variance
std_scalar.fit_transform(final_df)  # Returns Numpy array 

array([[  1.53299135e-01,  -1.06667720e-01,  -5.56743566e-01,
         -5.43654869e-01,   6.54726314e-01,  -6.13234762e-01,
         -8.06939227e-01,   1.19552516e+00,   4.20341666e-01,
          7.33577633e-01,   8.81917104e-01,  -8.81917104e-01],
       [  1.53299135e-01,  -1.06667720e-01,  -5.56743566e-01,
         -5.43654869e-01,   6.54726314e-01,  -3.48221598e-01,
         -4.87475944e-01,   1.19552516e+00,   4.20341666e-01,
          7.33577633e-01,   8.81917104e-01,  -8.81917104e-01],
       [  4.56736599e-01,  -1.24445674e+00,  -9.89864857e-01,
         -7.95569902e-01,   5.55622233e-01,  -9.25014955e-01,
          4.19571594e-01,   1.19552516e+00,   4.20341666e-01,
         -1.16149792e+00,  -1.13389342e+00,   1.13389342e+00],
       [  2.20729683e-01,  -1.06667720e-01,   2.59523483e-01,
         -5.43654869e-01,  -9.70580610e-01,   5.12928705e-03,
          8.93061817e-01,  -8.63434834e-01,  -9.80797222e-01,
         -1.16149792e+00,  -1.13389342e+00,   1.13389342e+00],
    

In [29]:
dataframe_standardized = pd.DataFrame(std_scalar.fit_transform(final_df), 
                                     columns= final_df.columns, index = final_df.index)

In [30]:
dataframe_standardized.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,am,gear,carb,vs_V-shaped,vs_straight
,,,,,,,,,,,,
Mazda RX4,0.153299,-0.106668,-0.556744,-0.543655,0.654726,-0.613235,-0.806939,1.195525,0.420342,0.733578,0.881917,-0.881917
Mazda RX4 Wag,0.153299,-0.106668,-0.556744,-0.543655,0.654726,-0.348222,-0.487476,1.195525,0.420342,0.733578,0.881917,-0.881917
Datsun 710,0.456737,-1.244457,-0.989865,-0.79557,0.555622,-0.925015,0.419572,1.195525,0.420342,-1.161498,-1.133893,1.133893
Hornet 4 Drive,0.22073,-0.106668,0.259523,-0.543655,-0.970581,0.005129,0.893062,-0.863435,-0.980797,-1.161498,-1.133893,1.133893
Hornet Sportabout,-0.234427,1.031121,1.109108,0.41955,-0.831835,0.238964,-0.487476,-0.863435,-0.980797,-0.529806,0.881917,-0.881917
