# 1. Correlation Coefficient

In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# Code starts here
# Filter method: keep only the features whose absolute Pearson correlation
# with SalePrice (computed on the training split) exceeds 0.5, then fit a
# linear regression on that subset and report its R^2 on the test split.

# Loading of data
# NOTE(review): this is an absolute, machine-specific path; a path relative to
# the notebook would let the notebook run on other machines.
ames = pd.read_csv(r"C:\Users\tdhoble\Downloads\feature_selection.csv")

# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
X = ames.drop(['SalePrice'], axis=1)
y = ames['SalePrice'].copy()

# Splitting of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Temporary frame holding the training features plus the target as `Class`.
# `.assign` returns a new frame, so X_train itself is never written to: this
# avoids the SettingWithCopyWarning and keeps the target out of X_train.
train_with_target = X_train.assign(Class=y_train)
t_corr = train_with_target.corr()['Class']

print(t_corr.head())

# Selecting columns having absolute correlation higher than 0.5
corr_columns = t_corr[abs(t_corr) > 0.5].index

# Dropping `Class` itself (its correlation with itself is always 1)
corr_columns = corr_columns.drop('Class')
print(corr_columns)

# Restricting train and test dataframes to the selected columns
X_train_new = X_train[corr_columns]
X_test_new = X_test[corr_columns]

# Initialising the model
model = LinearRegression()

# Fitting the model
model.fit(X_train_new, y_train)

# Finding the score (R^2 on the held-out split) of the model
corr_score = model.score(X_test_new, y_test)
print(corr_score)

# Checking how many columns were selected
print(len(X_train_new.columns))

Id            -0.022336
MSSubClass    -0.081002
MSZoning      -0.127442
LotFrontage    0.192883
LotArea        0.258990
Name: Class, dtype: float64
Index(['OverallQual', 'YearBuilt', 'YearRemodAdd', 'ExterQual', 'BsmtQual',
       'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'KitchenQual',
       'TotRmsAbvGrd', 'GarageCars', 'GarageArea'],
      dtype='object')
0.722705662820107
13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# 2. Chi Squared Test

In [29]:
# import packages
from sklearn.feature_selection import chi2

from sklearn.feature_selection import SelectKBest

# Code starts here
# Filter method: keep the 60 features with the highest chi-squared statistic
# against SalePrice, then fit a linear regression on them and report its R^2
# on the test split.
# NOTE(review): sklearn's chi2 is documented for non-negative features and
# class labels; here it is applied to the raw SalePrice values, so every
# distinct price is treated as its own class. Confirm this is intended.

# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
X = ames.drop(['SalePrice'], axis=1)
y = ames['SalePrice'].copy()

# Splitting dataframe into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

# Initialising the selector with the chi-squared score function
test = SelectKBest(score_func=chi2, k=60)

# Fitting the selector on X_train and reducing X_train to the selected columns
X_train_best = test.fit_transform(X_train, y_train)

# Applying the already-fitted selector to X_test (no refit on test data)
X_test_best = test.transform(X_test)

print("X_train_best:", X_train_best.shape)
print("X_test_best:", X_test_best.shape)

# Initialising the Linear Regression model
model = LinearRegression()

# Fitting the model
model.fit(X_train_best, y_train)

# Finding the model score (R^2 on the held-out split)
chi2_score = model.score(X_test_best, y_test)

print("chi2_score: ", chi2_score)
# Code ends here

X_train: (1022, 80)
X_test: (438, 80)
X_train_best: (1022, 60)
X_test_best: (438, 60)
chi2_score:  0.7526152480701623



# 3. ANOVA 

In [31]:
# import packages
import pandas as pd
from sklearn.feature_selection import f_regression

from sklearn.feature_selection import SelectKBest

# Code starts here
# Filter method: keep the 60 features with the highest univariate F-statistic
# (f_regression) against SalePrice, then fit a linear regression on them and
# report its R^2 on the test split. `Id` is excluded from the candidates.

# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
X = ames.drop(['Id', 'SalePrice'], axis=1)
y = ames['SalePrice'].copy()

# Splitting the dataframe into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

# Initialising the selector with the f_regression score function
test = SelectKBest(score_func=f_regression, k=60)

# Fitting the selector on X_train and reducing X_train to the selected columns
X_train_best = test.fit_transform(X_train, y_train)

# Applying the already-fitted selector to X_test (no refit on test data)
X_test_best = test.transform(X_test)

print("X_train_best:", X_train_best.shape)
print("X_test_best:", X_test_best.shape)

# Initialising the Linear Regression Model
model = LinearRegression()

# Fitting the model
model.fit(X_train_best, y_train)

# Finding the model score (R^2 on the held-out split)
f_regress_score = model.score(X_test_best, y_test)
print("f_regress_score: ", f_regress_score)
# Code ends here

X_train: (1022, 79)
X_test: (438, 79)
X_train_best: (1022, 60)
X_test_best: (438, 60)
f_regress_score:  0.7566701199447428


# 4. Wrapper methods

In [32]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier

# Wrapper method: for each candidate feature count, run recursive feature
# elimination (RFE) around a linear regression, refit on the surviving
# features, and remember the count that gives the best test-split R^2.

# Candidate numbers of features to keep
nof_list = [20, 30, 40, 50, 60, 70, 80]

# Variable to store the highest score.
# Starts at -inf rather than 0 because R^2 can be negative; with 0 a run in
# which every candidate scores below zero would never record a winner.
high_score = float("-inf")

# Variable to store the optimum number of features
nof = 0

# Code begins here
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
X = ames.drop(['SalePrice'], axis=1)
y = ames['SalePrice'].copy()

# The split does not depend on `n` (fixed random_state), so do it once
# instead of repeating it on every loop iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Loop to select the optimum number of features
for n in nof_list:
    model = LinearRegression()
    # n_features_to_select is keyword-only in current scikit-learn releases.
    rfe = RFE(model, n_features_to_select=n)
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe, y_train)
    # Score once and reuse the value for both the comparison and the update.
    score = model.score(X_test_rfe, y_test)
    if score > high_score:
        high_score = score
        nof = n


# Printing the no. of features with the highest score along with the highest score
print("No. of features=", nof, "gives the best score=", high_score)

No. of features= 30 gives the best score= 0.7627843526086595


# 5. Embedded methods

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

# Code starts here
# Embedded methods: fit Lasso (L1) and Ridge (L2) regressions on all features,
# report each model's R^2 on the test split, and count how many coefficients
# each model set exactly to zero.

# Build X and y in this cell rather than relying on whatever values an earlier
# cell left in the kernel.
X = ames.drop(['SalePrice'], axis=1)
y = ames['SalePrice'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialising the lasso model
lasso = Lasso(random_state=0)

# Fitting the model with train
lasso.fit(X_train, y_train)

# Finding the score of the model
lasso_score = lasso.score(X_test, y_test)
print("lasso_score: ", lasso_score)

# Checking how many feature coefficients are exactly zero
print("lasso coef_: ", (lasso.coef_ == 0).sum())

# Initialising the ridge model
ridge = Ridge(random_state=0)

# Fitting the model with train
ridge.fit(X_train, y_train)

# Finding the score of the model
ridge_score = ridge.score(X_test, y_test)
print("ridge_score: ", ridge_score)

# Checking how many feature coefficients are exactly zero
print("ridge coef_: ", (ridge.coef_ == 0).sum())
# Code ends here

lasso_score:  0.6671828548745306
lasso coef_:  0
ridge_score:  0.67883944067785
ridge coef_:  0


# 6. PCA

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 


# Code starts here
# Dimensionality reduction: standardise the features, project them onto the
# first 35 principal components, then fit a linear regression on those
# components and report its R^2 on the test split.

# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
X = ames.drop(['SalePrice'], axis=1)
y = ames['SalePrice'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Initialising standard scaler
scaler = StandardScaler()

# Scaling the features. The scaler is fitted on the training split only and
# then reused for the test split.
# NOTE(review): the explicit float cast is presumably what silences the
# dtype-conversion warning whose fragments appear in the saved output -- confirm.
X_train_scaled = scaler.fit_transform(X_train.astype(float))
X_test_scaled = scaler.transform(X_test.astype(float))

print("X_train_scaled: ", X_train_scaled.shape)
print("X_test_scaled: ", X_test_scaled.shape)

# Initialising PCA
pca = PCA(n_components=35, random_state=0)

# Fitting PCA on the scaled training features, then projecting both splits
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("X_train_pca: ", X_train_pca.shape)
print("X_test_pca: ", X_test_pca.shape)

# Initialising the model
model = LinearRegression()

# Fitting the model
model.fit(X_train_pca, y_train)

# Scoring the model (R^2 on the held-out split)
pca_score = model.score(X_test_pca, y_test)
print("pca_score: ", pca_score)

X_train_scaled:  (1022, 80)
X_test_scaled:  (438, 80)
X_train_pca:  (1022, 35)
X_test_pca:  (438, 35)
pca_score:  0.7614586077016585


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
