# Feature Selection Techniques - Pearson correlation

Source of data: https://archive.ics.uci.edu/ml/datasets/Air+Quality

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
np.random.seed(123)

In [2]:
##  colorful prints
def black(text):
     print('\033[30m', text, '\033[0m', sep='')  
def red(text):
     print('\033[31m', text, '\033[0m', sep='')  
def green(text):
     print('\033[32m', text, '\033[0m', sep='')  
def yellow(text):
     print('\033[33m', text, '\033[0m', sep='')  
def blue(text):
     print('\033[34m', text, '\033[0m', sep='') 
def magenta(text):
     print('\033[35m', text, '\033[0m', sep='')  
def cyan(text):
     print('\033[36m', text, '\033[0m', sep='')  
def gray(text):
     print('\033[90m', text, '\033[0m', sep='')

data source: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [3]:
df = pd.read_csv ('/home/wojciech/Pulpit/6/Breast_Cancer_Wisconsin.csv')
green(df.shape)
df.head(3)


FileNotFoundError: [Errno 2] File b'/home/wojciech/Pulpit/6/Breast_Cancer_Wisconsin.csv' does not exist: b'/home/wojciech/Pulpit/6/Breast_Cancer_Wisconsin.csv'

### Deleting unneeded columns

In [None]:
del df['Unnamed: 32']
del df['diagnosis']
del df['id']

In [None]:
df.isnull().sum()

In [None]:
import seaborn as sns

sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')

### Deletes duplicates
there were no duplicates

In [None]:
green(df.shape)
df.drop_duplicates(keep='first', inplace=True)
blue(df.shape)

In [None]:
blue(df.dtypes)

In [None]:
df.columns

### We choose the continuous variable - compactness_mean


In [None]:
print('max:',df['compactness_mean'].max())
print('min:',df['compactness_mean'].min())

sns.distplot(np.array(df['compactness_mean']))

# Pearson correlation

In [None]:
def matrix_plot(df,title):

    sns.set(style="ticks")

    corr = df.corr()
    corr = np.round(corr, decimals=2)


    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    f, ax = plt.subplots(figsize=(20, 20))
    #cmap = sns.diverging_palette(580, 10, as_cmap=True)
    cmap = sns.diverging_palette(180, 90, as_cmap=True) #Inna paleta barw

    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=0.3, center=0.03,annot=True,
                square=True, linewidths=.9, cbar_kws={"shrink": 0.8})
    plt.xticks(rotation=90)
    plt.title(title,fontsize=32,color='#0c343d',alpha=0.5)
    plt.show

In [None]:
matrix_plot(df,'Pearson correlation')

### Correlation to the result variable

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
CORREL = df.corr().sort_values('compactness_mean')
CORREL['compactness_mean'].plot(kind='barh',color='#0c343d',alpha=0.5)
plt.title('Correlation to the result variable', fontsize=20)
plt.xlabel('Correlation level')
plt.ylabel('Continuous independent variables')

### I find variables that are highly correlated with the result variable

In [None]:
kot = abs(CORREL['compactness_mean'])
FAT = kot[kot>=0.7]
FAT

### Compares variables in pairs

In [None]:
plt.barh(*zip(*FAT.items()),color='#0c343d',alpha=0.5) 
plt.xticks(rotation=90)

### High autocorrelation chart

In [None]:
CORR = df.corr()

kot = CORR[CORR>=.9]
plt.figure(figsize=(6,4))
sns.heatmap(kot, cmap="Greens")

## Deleting correlated independent variables
The code we compare the correlation between variables and remove one of two features whose correlation is higher than 0.9

In [None]:
corr = df.corr()
kot = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if kot[j]:
                kot[j] = False
selected_columns = df.columns[kot]
df2 = df[selected_columns]

In [None]:
kot   #<== PĘTLA ZROBIŁA NAM wektor 31 elementów True- False

### Dimensions have been reduced

In [None]:
blue(df.shape)
green(df2.shape)

### OLS linear regression model for variables before reduction

In [None]:
blue(df.shape)
green(df2.shape)

In [None]:
X1 = df.drop('compactness_mean', axis=1) 
y1 = df['compactness_mean']  

In [None]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

model = sm.OLS(y1, sm.add_constant(X1))
model_fit = model.fit()

print('R2: %.6f' % model_fit.rsquared)
#blue(model_fit.summary())

### OLS linear regression model for variables after reduction

In [None]:
X2 = df2.drop('compactness_mean', axis=1) 
y2 = df2['compactness_mean']  

In [None]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

model = sm.OLS(y2, sm.add_constant(X2))
model_fit = model.fit()

print('R2: %.6f' % model_fit.rsquared)
#blue(model_fit.summary())
red('The reduction of dimensions caused the deterioration of the models properties')

## Eliminates variables previously selected in the FAT procedure

In [None]:
FAT

In [None]:
df3 = df.drop(['compactness_se','concave points_worst','concavity_worst','concave points_mean','compactness_worst','concavity_mean'],1)

In [None]:
X3 = df3.drop('compactness_mean', axis=1) 
y3 = df3['compactness_mean']  

In [None]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

model = sm.OLS(y3, sm.add_constant(X3))
model_fit = model.fit()

print('R2: %.6f' % model_fit.rsquared)
#blue(model_fit.summary())
red('The reduction of dimensions caused the deterioration of the models properties')