In [1]:
%pylab
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd 
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


In [8]:
 class TelcoChurn:
    """
    Prints summary statistics of columns, train and predict the model, get random 1000 rows
    """
    churn_df = None
    churn_cleaned = None
    numerics= None
    col_names=None
    churn_df_numeric = None
    churn_df_nonnumeric=None
    churn_numeric=None #cleaned
    churn_non_numeric = None # cleaned
    
    
    def __init__(self):
        
        self.churn_df = pd.read_excel('/Users/rameshthulasiram/Documents/yams-python_ucsc/final project/Telco_customer_churn_dataset.xlsx', 'sheet1')
    
        self.numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.churn_df_numeric = self.churn_df.select_dtypes(include=self.numerics)
        self.churn_df_nonnumeric = self.churn_df.select_dtypes(exclude=self.numerics)
        
    def dataset_initial_understanding(self):
        print "Column names:"
        self.col_names= self.churn_df.columns.tolist()
        print self.col_names
        print "\nSample data:"
        print self.churn_df.head(6)   
        print str(self.churn_df)
    
    def preprocess_data(self):
        self.churn_df['TotalCharges'] = self.churn_df['TotalCharges'].map(lambda x: '' if (type(x) != type(1) and type(x) != type(0.5)) else x)
        self.remove_rows = self.churn_df[self.churn_df['TotalCharges'] == '']
        #remove_rows
        self.churn_cleaned = self.churn_df[self.churn_df['TotalCharges'] != '']
        #churn_cleaned
        self.churn_cleaned['TotalCharges'] = self.churn_cleaned['TotalCharges'].astype('float64')
        self.churn_numeric = self.churn_cleaned.select_dtypes(include=self.numerics)
        #churn_df_numeric
        self.churn_non_numeric= self.churn_cleaned.select_dtypes(exclude=self.numerics)
        print self.churn_cleaned.columns.tolist()
        

        
    def summary_statistics(self):
        for column_name in self.churn_cleaned:
            description = self.churn_cleaned[column_name].describe()
            if 'unique' in description and description['unique'] <= 10:
                description['unique_values'] = unique(self.churn_cleaned[column_name].get_values())
                description['value_counts'] = self.churn_cleaned[column_name].value_counts().to_dict()
                print description['unique_values']
                print description['value_counts']
                
               
       
    
    def histogram(self):
        """Save the Histogram for a numerical column, default 100 bins
        """
        
        #if column_name in self.churn_nonnumeric:
        #    raise Exception('Cannot generate histogram for non-numeric data')
        for column_name in self.churn_numeric:
            figure(figsize=(12, 9))
            ax = subplot(111)
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            ax.get_xaxis().tick_bottom()
            ax.get_yaxis().tick_left()
            xticks(fontsize=14)
            yticks(range(5000, 30001, 5000), fontsize=14)
            xlabel(column_name, fontsize=16)
            ylabel("Count", fontsize=16)

            hist(list(self.churn_cleaned[column_name].values) + list(self.churn_cleaned[column_name].values),
                color="#3F5D7D", bins=100)
        #text(1300, -5000, "Data source: Lending Circle | "
               #"Author: Ramesh Thulasiram", fontsize=10)
        #savefig(filename or (column_name+'.png'), bbox_inches="tight")


    def bar_chart(self):
        """ the bar chart- to plot categorical variables"""
        #if column_name in self.churn_numeric:
         #   raise Exception('Cannot generate bar chart for numeric data')
        for column_name in self.churn_non_numeric:
            bar_dict = self.churn_non_numeric[column_name].value_counts().to_dict()
            figure()
            ax = subplot(111)
            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            ax.get_xaxis().tick_bottom()
            ax.get_yaxis().tick_left()
            #xticks(fontsize=14)
            yticks(range(5000, 30001, 5000), fontsize=14)
            xlabel(column_name, fontsize=16)
            ylabel("Count", fontsize=16)
            width = 0.35
            bar(arange(len(bar_dict.keys())), bar_dict.values(), width,  color="#3F5D7D")
            xticks(arange(len(bar_dict.keys()))+width/2., bar_dict.keys() )
        #text(1300, -5000, "Data source: Lending Circle | "
        #       "Author: Ramesh Thulasiram", fontsize=10)
       
    
    def train_and_predict(self):
        drop_columns = ['customerID']
        categoricals = ["gender","SeniorCitizen", "Partner", "Dependents","PhoneService", "MultipleLines", "InternetService",
                "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
                "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling","PaymentMethod"]
        
### Data cleansing
#self.convert_categoricals(categoricals)
        for cat in categoricals:
            self.churn_cleaned[cat] = self.churn_cleaned[cat].astype('category')
            
# Split test & training data as 1:3
        self.churn_cleaned['is_train'] = np.random.uniform(0, 1, len(self.churn_cleaned)) <= .75
        self.train,self.test = self.churn_cleaned[self.churn_cleaned['is_train']==True], self.churn_cleaned[self.churn_cleaned['is_train']==False]

# Convert loan_status to a format that we can use to train
        labels_train = (self.churn_cleaned[self.churn_cleaned['is_train']==True])[['Churn']]
        labels_test = (self.churn_cleaned[self.churn_cleaned['is_train']==False])[['Churn']]
        le = preprocessing.LabelEncoder()
        dv = DictVectorizer(sparse=False)
        labels_train = le.fit_transform(labels_train)
        labels_test = le.transform(labels_test)


 # Keep only categorical data here
        categorical_view = self.churn_cleaned.drop(list( (set(self.churn_cleaned.columns) - set(categoricals)) ) , axis=1)
        del self.churn_cleaned['Churn']
        
# Generate categorical training & test data
        categorical_train_as_dicts = [dict(r.iteritems()) for _, r in categorical_view[self.churn_cleaned['is_train']==True].iterrows()]
        categorical_train_fea = dv.fit_transform(categorical_train_as_dicts)
        categorical_test_as_dicts = [dict(r.iteritems()) for _, r in categorical_view[self.churn_cleaned['is_train']==False].iterrows()]
        categorical_test_fea = dv.transform(categorical_test_as_dicts)

 # Generate numerical training & test data
        numerical_train = self.churn_cleaned[self.churn_cleaned['is_train']==True].drop(list( set(drop_columns) | set(categoricals) | set(['is_train'])), axis=1)
        numerical_train_fea = numerical_train.as_matrix()
        numerical_test = self.churn_cleaned[self.churn_cleaned['is_train']==False].drop(list( set(drop_columns) | set(categoricals) | set(['is_train'])), axis=1)
        numerical_test_fea = numerical_test.as_matrix()
        train_fea = np.concatenate( (categorical_train_fea, numerical_train_fea), axis=1)
        test_fea = np.concatenate( (categorical_test_fea, numerical_test_fea), axis=1)

# Use a random forest classifier
        clf = RandomForestClassifier(n_jobs=10)
        clf.fit(train_fea, labels_train)
        
# Predict now for test data
        label_predictions = le.inverse_transform(clf.predict(test_fea).astype('I'))

    
# Some output of how we did
        print "Predictions"
        print label_predictions
        print "Original Test set labels"
        print le.inverse_transform(labels_test)
        orig = le.inverse_transform(labels_test).flatten()
        pred = label_predictions
        counted = (pred==orig)
        print sum(counted==True)

        print sum(counted==False)

        print sum(counted==False)/ (sum(counted==True) + sum(counted==False))


        # Misclassification rate

        
        print confusion_matrix( orig, pred)
        label =["Yes", "No"]
        print label
        cm = confusion_matrix( orig, pred, label)
        print(cm)
        
        
tc= TelcoChurn()

'''
tc.dataset_initial_understanding()
tc.preprocess_data()
tc.train_and_predict()
tc.histogram()
tc.bar_chart()
tc.summary_statistics()
'''

        


'\ntc.dataset_initial_understanding()\ntc.preprocess_data()\ntc.train_and_predict()\ntc.histogram()\ntc.bar_chart()\ntc.summary_statistics()\n'

In [9]:
tc.dataset_initial_understanding()

Column names:
[u'customerID', u'gender', u'SeniorCitizen', u'Partner', u'Dependents', u'tenure', u'PhoneService', u'MultipleLines', u'InternetService', u'OnlineSecurity', u'OnlineBackup', u'DeviceProtection', u'TechSupport', u'StreamingTV', u'StreamingMovies', u'Contract', u'PaperlessBilling', u'PaymentMethod', u'MonthlyCharges', u'TotalCharges', u'Churn']

Sample data:
   customerID  gender SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female            No     Yes         No       1           No   
1  5575-GNVDE    Male            No      No         No      34          Yes   
2  3668-QPYBK    Male            No      No         No       2          Yes   
3  7795-CFOCW    Male            No      No         No      45           No   
4  9237-HQITU  Female            No      No         No       2          Yes   
5  9305-CDSKC  Female            No      No         No       8          Yes   

      MultipleLines InternetService OnlineSecurity  ...  DeviceProtection

In [4]:
tc.preprocess_data()

[u'customerID', u'gender', u'SeniorCitizen', u'Partner', u'Dependents', u'tenure', u'PhoneService', u'MultipleLines', u'InternetService', u'OnlineSecurity', u'OnlineBackup', u'DeviceProtection', u'TechSupport', u'StreamingTV', u'StreamingMovies', u'Contract', u'PaperlessBilling', u'PaymentMethod', u'MonthlyCharges', u'TotalCharges', u'Churn']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
tc.summary_statistics()

[u'Female' u'Male']
{u'Male': 3549, u'Female': 3483}
[u'No' u'Yes']
{u'Yes': 1142, u'No': 5890}
[u'No' u'Yes']
{u'Yes': 3393, u'No': 3639}
[u'No' u'Yes']
{u'Yes': 2099, u'No': 4933}
[u'No' u'Yes']
{u'Yes': 6352, u'No': 680}
[u'No' u'No phone service' u'Yes']
{u'Yes': 2967, u'No phone service': 680, u'No': 3385}
[u'DSL' u'Fiber optic' u'No']
{u'Fiber optic': 3096, u'DSL': 2416, u'No': 1520}
[u'No' u'No internet service' u'Yes']
{u'Yes': 2015, u'No internet service': 1520, u'No': 3497}
[u'No' u'No internet service' u'Yes']
{u'Yes': 2425, u'No internet service': 1520, u'No': 3087}
[u'No' u'No internet service' u'Yes']
{u'Yes': 2418, u'No internet service': 1520, u'No': 3094}
[u'No' u'No internet service' u'Yes']
{u'Yes': 2040, u'No internet service': 1520, u'No': 3472}
[u'No' u'No internet service' u'Yes']
{u'Yes': 2703, u'No internet service': 1520, u'No': 2809}
[u'No' u'No internet service' u'Yes']
{u'Yes': 2731, u'No internet service': 1520, u'No': 2781}
[u'Month-to-month' u'One year' 

In [None]:
tc.histogram()

In [None]:
tc.bar_chart()

In [6]:
tc.train_and_predict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Predictions
[u'No' u'Yes' u'Yes' ..., u'No' u'No' u'Yes']
Original Test set labels
[[u'No']
 [u'Yes']
 [u'Yes']
 ..., 
 [u'No']
 [u'No']
 [u'Yes']]
1419
407
0.222891566265
[[1216  134]
 [ 273  203]]
['Yes', 'No']
[[ 203  273]
 [ 134 1216]]


  y = column_or_1d(y, warn=True)


NameError: name 'self' is not defined

In [None]:
pip freeze
pip freeze > requirements.txt

mention what you are using out  of 12 requirements. 