# Predicting For New Data
<ul>
    <li>Create or get new data</li>
    <li>Train model object </li>
    <li>Use trained model object to predict new data</li>
</ul>

In [2]:
import pandas as pd # Python pandas package is used for data manipulation and analysis
import numpy as np # Numpy is library for python used for multidimensional array and matrices and mathematical functions to operate on these array

In [3]:
#read csv file directory from my pc
telecom = pd.read_csv('E:\\Telecom_data.csv')
print(telecom)

     State  Account Length  Area Code     Phone Int'l Plan VMail Plan  \
0       KS             128        415  382-4657         no        yes   
1       OH             107        415  371-7191         no        yes   
2       NJ             137        415  358-1921         no         no   
3       OH              84        408  375-9999        yes         no   
4       OK              75        415  330-6626        yes         no   
5       AL             118        510  391-8027        yes         no   
6       MA             121        510  355-9993         no        yes   
7       MO             147        415  329-9001        yes         no   
8       LA             117        408  335-4719         no         no   
9       WV             141        415  330-8173        yes        yes   
10      IN              65        415  329-6603         no         no   
11      RI              74        415  344-9403         no         no   
12      IA             168        408  363-1107    

In [4]:
import random
random.seed(1)


In [5]:
""" random sample records from existing data, this will be used as new data """
new_telecom_data = telecom.sample(333)

In [6]:
new_telecom_data.shape
print(new_telecom_data)

     State  Account Length  Area Code     Phone Int'l Plan VMail Plan  \
2299    MN             100        415  327-8732         no        yes   
336     SC              99        510  397-4304         no         no   
1135    ID             144        415  402-3476         no        yes   
1121    MI              91        415  390-7930         no         no   
1099    HI             157        415  333-7961         no         no   
489     IA             130        415  361-5277         no         no   
2340    ID             101        510  406-4768         no        yes   
1458    MD             125        408  349-6464         no         no   
93      ME              78        415  400-9510         no         no   
886     DC             103        510  386-2317         no        yes   
2385    NM             132        408  405-3848         no         no   
377     ND              79        408  363-3515         no         no   
424     DC             112        415  394-5537    

In [7]:
""" we remove sample data from original dataset """
remove_rows = telecom.index.isin(new_telecom_data.index)
train_telecom_data = telecom[~remove_rows]

In [8]:
train_telecom_data.shape
print(train_telecom_data)

     State  Account Length  Area Code     Phone Int'l Plan VMail Plan  \
0       KS             128        415  382-4657         no        yes   
1       OH             107        415  371-7191         no        yes   
2       NJ             137        415  358-1921         no         no   
3       OH              84        408  375-9999        yes         no   
5       AL             118        510  391-8027        yes         no   
6       MA             121        510  355-9993         no        yes   
7       MO             147        415  329-9001        yes         no   
8       LA             117        408  335-4719         no         no   
9       WV             141        415  330-8173        yes        yes   
10      IN              65        415  329-6603         no         no   
11      RI              74        415  344-9403         no         no   
12      IA             168        408  363-1107         no         no   
13      MT              95        510  394-8006    

In [9]:
""" 3000 rows to train model, 333 rows to predict as new data and check performance """
# target data
telecom_result = train_telecom_data['Churn?']
y = np.where(telecom_result == 'True.', 1,0)
# we dont need these columns
to_drop = ['State','Area Code','Phone','Churn?']
telecom_feat_space = train_telecom_data.drop(to_drop, axis = 1)
# yes/no has to be converted into boolean values
#  convert these from boolean to 1 and 0
yes_no_cols = ["Int'l Plan", "VMail Plan"]
telecom_feat_space[yes_no_cols] = telecom_feat_space[yes_no_cols] == 'yes'
print('There are {} instances for churn class and {} instances for not-churn classes.'.format(y.sum(), y.shape[0] - y.sum()))
features = telecom_feat_space.columns
print(features)
X = telecom_feat_space.as_matrix().astype(np.float)
# this is important standardization of data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print("Feature space holds %d observations and %d features" % X.shape)
print("unique target labels:", np.unique(y))

There are 432 instances for churn class and 2568 instances for not-churn classes.
Index(['Account Length', 'Int'l Plan', 'VMail Plan', 'VMail Message',
       'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls',
       'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins',
       'Intl Calls', 'Intl Charge', 'CustServ Calls'],
      dtype='object')
Feature space holds 3000 observations and 17 features
unique target labels: [0 1]


In [12]:
from sklearn.ensemble import RandomForestClassifier as RF
""" creating  model object"""
random_forest_obj = RF(n_estimators=100, max_depth=3, class_weight="balanced")

In [13]:
"""training model object"""
random_forest_obj.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=3, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Feature Extraction for New Data

In [14]:
"""We use the excluded sample as new data, and perform feature engineering"""
# target data
telecom_result = new_telecom_data['Churn?']
y = np.where(telecom_result == 'True.', 1,0)
# we dont need these columns
to_drop = ['State','Area Code','Phone','Churn?']
telecom_feat_space = new_telecom_data.drop(to_drop, axis = 1)
# yes/no has to be converted into boolean values
#  convert these from boolean to 1 and 0
yes_no_cols = ["Int'l Plan", "VMail Plan"]
telecom_feat_space[yes_no_cols] = telecom_feat_space[yes_no_cols] == 'yes'
print('There are {} instances for churn class and {} instances for not-churn classes.'.format(y.sum(), y.shape[0] - y.sum()))
features = telecom_feat_space.columns
print(features)
X = telecom_feat_space.as_matrix().astype(np.float)
# this is important standardization of data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
print("Feature space holds %d observations and %d features" % X.shape)
print("unique target labels:", np.unique(y))

There are 51 instances for churn class and 282 instances for not-churn classes.
Index(['Account Length', 'Int'l Plan', 'VMail Plan', 'VMail Message',
       'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls',
       'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins',
       'Intl Calls', 'Intl Charge', 'CustServ Calls'],
      dtype='object')
Feature space holds 333 observations and 17 features
unique target labels: [0 1]


In [15]:
"""We predict new data"""
predicted_labels = random_forest_obj.predict(X)

In [16]:
new_telecom_data.shape

(333, 21)

In [17]:
"""These are the predicted labels"""
print(predicted_labels)


[0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0
 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0
 0 0 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 1 0]


In [20]:
from sklearn.metrics import accuracy_score
"""check accuracy of new predictions"""
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)
print("Accuracy of Random Forest on new data")
print("%.3f" % accuracy(y, predicted_labels))

Accuracy of Random Forest on new data
0.856


## Pickle 

In [21]:
""" The train model file can be written to file """
import pickle

with open("model_object.pkl", "wb") as file_handle:
    pickle.dump(random_forest_obj, file_handle)

In [22]:
""" The saved model object can be read into memory and used any where"""
import pickle
with open("model_object.pkl", "rb") as file_handle:
    read_model_obj = pickle.load(file_handle)

""" we can use the read model for making predictions """
pred_results = read_model_obj.predict(X)

In [23]:
pred_results

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0,