# Exploring the case data

Let's start by loading the dataset:

In [1]:
##### added line to ensure plots are showing
%matplotlib inline
#####

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Use this code if you need large figures (on high-resolution screens)
# For bigger pictures:
#plt.rcParams["figure.dpi"] = 144

# Seed NumPy's global random number generator.
np.random.seed(40)

# Load the churn data; the first CSV column is used as the row index.
data = pd.read_csv("churn.csv", sep=',', index_col=0)

# Split the frame into the feature matrix X and the dependent variable y.
X = data.drop(columns='Churn')
y = data['Churn']

# Uncomment the next line if you want to see all the variables first
#print(data.describe(include='all'))

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2, mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import SelectKBest
from scipy.stats import chi2_contingency
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Transforming the variables

First, you need to check which variables need to be transformed into numerical form so we can process them with the appropriate feature selection techniques. 

TIP: You don't need to include the dependent variable 'Churn' as we will do this later. Check the dataset, and use the following function to fill in your answers:

### Finding the variables

In [None]:
# the lists should contain the variables' string names (as found in X.columns)
def return_variables_to_be_converted_or_removed():
    """Return the names of the columns to convert and the columns to remove.

    Columns are picked by position from the module-level feature frame ``X``:
    positions 1, 3 and 4 are to be converted, position 2 is to be removed.

    Returns:
        A ``(to_convert, to_remove)`` tuple of lists of column names.
    """
    columns = X.columns
    convert_positions = [1, 3, 4]
    remove_position = 2

    to_convert = list(columns[convert_positions])
    to_remove = [columns[remove_position]]
    return to_convert, to_remove

# Keep an untouched copy of the raw features; it is the input of the
# conversion step further down.
X_copy = X.copy()
# Call the selection once and keep both lists (the extra call whose result
# was thrown away has been removed).
to_convert, to_remove = return_variables_to_be_converted_or_removed()

In [3]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


### Converting and removing variables

Now, do the actual conversion and removal. Make sure your X matrix remains a pandas dataframe, and that you use pandas to do the conversion as well (use the variable's name as prefix and the unique values to label the new variables):

In [4]:
def convert_and_remove_categorical_variables(X, to_convert, to_remove):
    """Drop unwanted columns and dummy-encode the categorical ones.

    Args:
        X: pandas DataFrame holding the features; it is not modified.
        to_convert: names of the columns to replace by dummy variables.
        to_remove: names of the columns to drop.

    Returns:
        A new DataFrame without the ``to_remove`` columns, in which every
        ``to_convert`` column is replaced by dummy columns named
        ``<column>_<value>``. The first level of each converted column is
        left out (``drop_first=True``).
    """
    X_reduced = X.drop(columns=list(to_remove))
    # Encode the columns the caller asked for instead of a hard-coded list,
    # so the function keeps working when the selection or the frame changes.
    return pd.get_dummies(X_reduced, columns=list(to_convert), drop_first=True)

X = convert_and_remove_categorical_variables(X_copy, to_convert, to_remove)
# Show the converted frame as the cell output without running the
# conversion a second time.
X

Verify your answer:

In [5]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


As promised, we will do the dependent variable separately:

In [6]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'Churn' labels in y by integer class codes. The fitted encoder
# is kept in `encoder`; y is rebound to the encoded values.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

## Filter methods

Now we are good to go. Let's now discover what the filter methods are telling us. Implement various filter methods and, based on their feedback, you can propose which variables you think are the most and least important:

Now, based on using a few feature selection techniques, mutual information and $\chi^2$, which variables would you remove, and which ones would you retain?

Remove the features whose score is below the mean score of all features for every metric, and retain the ones that score above the mean for at least one metric. Make sure you normalise first.

In [7]:
# the set should contain the variables' string names (as found in X.columns)
def return_variables_to_be_retained_or_removed(X_converted):
    """Select the features scoring above average on chi-squared or mutual information.

    The features are passed through ``sklearn.preprocessing.normalize`` and
    scored against the module-level target ``y`` with ``chi2`` and
    ``mutual_info_classif``. A feature is retained when its score is strictly
    above the mean score of all features for at least one of the two metrics.

    Args:
        X_converted: pandas DataFrame of numerical features.

    Returns:
        The set of retained column names.

    NOTE(review): despite the function name only the retained set is
    returned; the removed variables are the remaining columns of
    ``X_converted``. Confirm that callers do not expect a tuple.
    """
    from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
    from sklearn.preprocessing import normalize

    # mutual_info_classif draws from NumPy's global random state when no
    # random_state is given, so seed it to get the same scores on every call.
    np.random.seed(40)

    # NOTE(review): normalize() rescales each row (sample) to unit norm, not
    # each column -- confirm this is the normalisation that is intended.
    X_norm = normalize(X_converted)
    columns = X_converted.columns

    def _above_mean(score_func):
        """Return the names of the columns scoring above the mean for score_func."""
        # k="all": only the scores are used, and a hard-coded k would tie the
        # function to one particular number of features.
        scores = SelectKBest(score_func, k="all").fit(X_norm, y).scores_
        return set(columns[scores > np.average(scores)])

    print("Chi-squared:")
    chi2_selected = _above_mean(chi2)

    print("Mutual information:")
    # Mutual information for a discrete target (classification).
    mutual_info_selected = _above_mean(mutual_info_classif)

    # Above the mean on either metric is enough to be retained.
    return chi2_selected | mutual_info_selected



In [8]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


## PCA 

Next, let's see what PCA tells us:

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

# Fit a PCA with all components on the normalised features.
X_norm = normalize(X)
pca = PCA().fit(X_norm)

# Scree plot: explained-variance ratio per component, numbered from 1.
plt.plot(
    np.arange(len(pca.components_)) + 1,
    pca.explained_variance_ratio_,
    '-o',
)
plt.xlabel('components')
plt.ylabel('% explained variance')
plt.title("Scree plot")
plt.show()

Check, for the 5 most interesting components, which variables have a weight higher than 0.1:

### Select based on PCA

In [None]:
# the set should contain the variables' string names (as found in X.columns)
def find_pca_variables(X_converted):
    """Collect the variables that weigh heavily in the leading principal components.

    A PCA is fitted on the ``normalize``-d features. A variable is reported
    when the absolute value of its weight exceeds 0.1 in at least one of the
    first five components.

    Args:
        X_converted: pandas DataFrame of numerical features.

    Returns:
        Set of column names of ``X_converted``.
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import normalize

    weight_cutoff = 0.1
    n_leading = 5

    pca = PCA().fit(normalize(X_converted))
    leading_components = pca.components_[:n_leading]

    interesting_variables = set()
    for weights in leading_components:
        # Each component holds one weight per column, in column order.
        for name, weight in zip(X_converted.columns, weights):
            if abs(weight) > weight_cutoff:
                interesting_variables.add(name)
    return interesting_variables
    

In [None]:
# Run the PCA-based selection on a copy of X; as the last expression of the
# cell, the resulting set is shown as the cell output.
find_pca_variables(X.copy())

In [None]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###
