<font size=5>1.Prepare a data quality report for the CSV file.</font>

In [157]:
# Import pandas, numpy and matplotlib libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline

In [158]:
mydata = pd.read_csv('CustomerChurn-16213170.csv',  keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)

In [159]:
#Check how many rows and columns your CSV has.
mydata.shape

(1000, 33)

In [160]:
#Print the first 5 rows.
mydata.head(5)

Unnamed: 0,customer,age,occupation,regionType,marriageStatus,children,income,numHandsets,handsetAge,smartPhone,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
0,1046755,48,professional,suburban,yes,True,7,1,421,True,...,69.33,1.867778,-11.700647,2.0,14,0.0,1,0,0,True
1,1023345,56,professional,suburban,yes,True,7,2,587,True,...,0.0,1.749064,13.662191,0.0,21,0.67,0,0,0,False
2,1067602,34,,,no,False,1,1,337,True,...,1.0,3.174762,19.364133,2.0,11,0.67,0,0,2,True
3,1072994,36,,,yes,False,7,1,240,True,...,4.33,0.835729,-5.01055,3.67,8,16.67,1,0,2,False
4,1040228,0,,town,unknown,False,0,1,484,True,...,0.0,1.73571,-7.030517,1.0,16,1.67,0,0,0,False


In [161]:
#Print the first the last 5 rows.
mydata.tail(5)
mydata.dtypes

customer                        int64
age                             int64
occupation                     object
regionType                     object
marriageStatus                 object
children                         bool
income                          int64
numHandsets                     int64
handsetAge                      int64
smartPhone                       bool
currentHandsetPrice           float64
creditRating                   object
homeOwner                        bool
creditCard                     object
avgBill                       float64
avgMins                       float64
avgrecurringCharge            float64
avgOverBundleMins             float64
avgRoamCalls                  float64
callMinutesChangePct          float64
billAmountChangePct           float64
avgReceivedMins               float64
avgOutCalls                   float64
avgInCalls                    float64
peakOffPeakRatio              float64
peakOffPeakRatioChangePct     float64
avgDroppedCa

In [162]:
# Select columns containing categorical data
categorical_columns = mydata[['occupation','regionType','marriageStatus','income','creditRating', 'creditCard', 'children', 'smartPhone', 'homeOwner', 'churn']].columns

# Convert data type to category for these columns
for column in categorical_columns:
    mydata[column] = mydata[column].astype('category')
    
mydata.dtypes

customer                         int64
age                              int64
occupation                    category
regionType                    category
marriageStatus                category
children                      category
income                        category
numHandsets                      int64
handsetAge                       int64
smartPhone                    category
currentHandsetPrice            float64
creditRating                  category
homeOwner                     category
creditCard                    category
avgBill                        float64
avgMins                        float64
avgrecurringCharge             float64
avgOverBundleMins              float64
avgRoamCalls                   float64
callMinutesChangePct           float64
billAmountChangePct            float64
avgReceivedMins                float64
avgOutCalls                    float64
avgInCalls                     float64
peakOffPeakRatio               float64
peakOffPeakRatioChangePct

In [163]:
mydata.select_dtypes(['category']).describe().T

Unnamed: 0,count,unique,top,freq
occupation,274,7,professional,174
regionType,521,7,suburban,333
marriageStatus,1000,3,yes,385
children,1000,2,False,748
income,1000,10,0,264
smartPhone,1000,2,True,902
creditRating,1000,7,B,361
homeOwner,1000,2,False,643
creditCard,1000,6,true,642
churn,1000,2,False,504


In [164]:
# Checking for duplicated rows
print(mydata.set_index('customer').index.get_duplicates())

[]


In [165]:
# Checking for duplicated columns
print(mydata.set_index('customer').index.T.get_duplicates())

[]


In [166]:
# Select columns containing continuous data
continuous_columns = []
for column in mydata.columns.values:
    if column not in categorical_columns:
        continuous_columns.append(column)
mydata[continuous_columns].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer,1000.0,1049907.0,29377.892646,1000022.0,1023521.0,1049796.0,1075653.0,1099988.0
age,1000.0,29.85,22.430261,0.0,0.0,34.0,48.0,90.0
numHandsets,1000.0,1.862,1.341923,1.0,1.0,1.0,2.0,13.0
handsetAge,1000.0,389.972,260.18374,-2.0,206.75,331.0,533.0,1599.0
currentHandsetPrice,1000.0,37.12558,57.391136,0.0,0.0,0.0,59.99,299.99
avgBill,1000.0,58.70876,40.375648,0.0,33.695,49.71,73.7025,469.67
avgMins,1000.0,535.9519,540.746586,0.0,158.75,372.0,736.1875,4530.75
avgrecurringCharge,1000.0,46.00562,22.770386,0.0,30.0,44.99,59.1925,169.99
avgOverBundleMins,1000.0,42.2615,92.940958,0.0,0.0,3.5,45.75,1389.0
avgRoamCalls,1000.0,1.03898,4.073257,0.0,0.0,0.0,0.16,62.97


In [167]:
#- Drop constant columns, if any.
columns = mydata.columns.values
for col in columns:
    # drop col if unique values is 1
    if mydata[col].nunique(dropna=False) == 1:
        del mydata[col]

In [168]:
#Save your updated/cleaned data frame to a new csv file.
mydata.to_csv('ready1.csv', index=False)

In [169]:
clean_data = pd.read_csv('ready1.csv')

In [170]:
# Check for irregular cardinality in categorical features. There could be same values spelled differently
for column in categorical_columns:
    print("Unique values for " + column + ":", pd.unique(clean_data[column].ravel()))

Unique values for occupation: ['professional' nan 'crafts' 'self-employed' 'clerical' 'homemaker'
 'retired' 'student']
Unique values for regionType: ['suburban' nan 'town' 'rural' 't' 'unknown' 's' 'r']
Unique values for marriageStatus: ['yes' 'no' 'unknown']
Unique values for income: [7 1 0 4 2 5 6 9 3 8]
Unique values for creditRating: ['E' 'B' 'A' 'D' 'F' 'C' 'G']
Unique values for creditCard: ['true' 'false' 't' 'f' 'no' 'yes']
Unique values for children: [ True False]
Unique values for smartPhone: [ True False]
Unique values for homeOwner: [False  True]
Unique values for churn: [ True False]


In [171]:
# Check whether there are null values in the data where values would be expected
clean_data.isnull().sum()

customer                        0
age                             0
occupation                    726
regionType                    479
marriageStatus                  0
children                        0
income                          0
numHandsets                     0
handsetAge                      0
smartPhone                      0
currentHandsetPrice             0
creditRating                    0
homeOwner                       0
creditCard                      0
avgBill                         0
avgMins                         0
avgrecurringCharge              0
avgOverBundleMins               0
avgRoamCalls                    0
callMinutesChangePct            0
billAmountChangePct             0
avgReceivedMins                 0
avgOutCalls                     0
avgInCalls                      0
peakOffPeakRatio                0
peakOffPeakRatioChangePct       0
avgDroppedCalls                 0
lifeTime                        0
lastMonthCustomerCareCalls      0
numRetentionCa

In [172]:
#Replace the conflicting values name in  creditCard.
clean_data['creditCard'].replace('t', 'true', inplace=True) 
clean_data['creditCard'].replace('f', 'false', inplace=True)
clean_data['creditCard'].replace('no', 'false', inplace=True) 
clean_data['creditCard'].replace('yes', 'true', inplace=True) 
print("Unique values forcreditCard:", pd.unique(clean_data['creditCard'].ravel()))

Unique values forcreditCard: ['true' 'false']


In [173]:
#It seems that the empty values of occupation take 72.6%. So I decide to delete it.
clean_data = clean_data.drop('occupation', 1)

In [174]:
#It seems that the empty values of occupation take 47.9%. So I decide to delete it.
clean_data = clean_data.drop('regionType', 1)

In [175]:
clean_data = clean_data.drop('customer', 1)

In [176]:
#It seems that the empty values of occupation take 55.8% percent. So I decide to delete it.
clean_data[(clean_data['currentHandsetPrice']==0)]['currentHandsetPrice'].count()

558

In [177]:
clean_data = clean_data.drop('currentHandsetPrice', 1)

In [178]:
#说明为替换age

In [179]:
#for age
#The number of people whose age is 0 are 307. It is impossible, and it should be treaded as missing value.
clean_data[(clean_data['age']==0)]['age'].count()

307

In [180]:
print("The middle age is that will be used is: ", clean_data['age'].median())
clean_data['age'].replace(0, clean_data['age'].median(), inplace=True)
clean_data[(clean_data['age']==0)]['age'].count()

The middle age is that will be used is:  34.0


0

In [181]:
# It is impossible to fix the missing data, and it is only 38% missing data, I cannot drop it. so I replace it with NaN.
clean_data[(clean_data['marriageStatus']=='unknown')]['marriageStatus'].count()

382

In [182]:
#说明为什么删除marriagestatus

In [183]:
clean_data = clean_data.drop('marriageStatus', 1)

In [184]:
#for income
#The number of people who have 0 income are 264. It seem not so reasonable, but they are possible, for example, students
#  who have mobile and 0 income.
#Therefore, it is not possible to understand whether the data is correct or not, and I decide to keep it.
clean_data[(clean_data['income']==0)]

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
4,34,False,0,1,484,True,E,True,false,47.93,...,0.00,1.735710,-7.030517,1.00,16,1.67,0,0,0,False
20,34,False,0,1,371,True,C,True,false,42.72,...,0.00,2.017589,-6.880538,5.67,13,4.67,0,0,0,True
22,34,False,0,1,386,True,E,True,false,69.60,...,26.67,1.105664,18.213889,31.33,13,20.67,0,0,0,True
32,34,False,0,2,18,True,C,True,false,34.99,...,0.00,4.500000,-17.347207,2.00,7,0.00,0,0,0,True
35,34,False,0,2,40,True,C,True,false,149.80,...,43.67,0.629730,-18.329509,7.00,8,0.67,0,0,0,False
36,42,False,0,1,383,True,G,True,false,60.82,...,16.67,1.289157,9.409143,8.67,13,1.00,0,0,0,False
40,34,False,0,1,199,True,C,True,false,98.53,...,4.00,0.786283,-7.077718,13.33,7,5.00,0,0,0,True
45,34,False,0,1,312,True,C,True,false,112.67,...,24.33,12.723657,-14.739520,6.33,10,0.00,0,0,0,True
46,34,False,0,1,349,True,G,True,false,50.89,...,4.00,0.584286,-7.267449,13.00,12,0.00,0,0,0,True
47,34,False,0,2,98,True,B,True,false,134.66,...,3.67,1.178275,10.609530,16.67,11,1.33,0,0,1,False


In [185]:
#It is reasonable, they are "churn"
clean_data[(clean_data['avgBill']==0)]

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
226,34,False,6,1,773,True,A,False,True,0.0,...,0.0,1.197605,13.684288,0.0,25,0.0,0,0,0,True
452,60,True,3,3,674,False,D,False,True,0.0,...,0.0,0.0,-0.003758,0.0,34,0.0,0,0,0,True
481,34,False,0,2,323,True,F,True,False,0.0,...,0.0,2.572347,16.087536,4.67,29,0.33,0,0,0,True


In [186]:
#It is reasonable, they are "churn" or they avgInCalls is 0.
clean_data[(clean_data['avgMins']==0)]

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
226,34,False,6,1,773,True,A,False,True,0.0,...,0.0,1.197605,13.684288,0.0,25,0.0,0,0,0,True
452,60,True,3,3,674,False,D,False,True,0.0,...,0.0,0.0,-0.003758,0.0,34,0.0,0,0,0,True
481,34,False,0,2,323,True,F,True,False,0.0,...,0.0,2.572347,16.087536,4.67,29,0.33,0,0,0,True
619,56,True,8,1,774,True,B,False,True,42.49,...,0.0,0.0,6.908503,0.33,26,0.33,0,0,0,True
635,34,False,0,1,579,True,B,True,False,5.0,...,0.0,0.0,-17.709177,0.0,20,0.0,1,0,0,True
783,34,False,0,1,438,True,B,True,False,3.03,...,0.0,0.0,-13.315054,0.0,14,0.0,0,0,0,False
844,38,False,7,1,530,True,B,False,True,35.16,...,0.0,0.0,1.239732,0.0,17,0.0,0,0,0,True
870,34,False,0,2,463,True,C,True,False,5.0,...,0.0,0.0,7.816691,0.0,31,0.0,0,0,0,True
902,34,False,7,3,562,True,C,False,True,5.0,...,0.0,0.0,9.309546,0.0,37,0.0,0,0,0,True
904,32,False,8,2,1004,False,A,False,True,5.05,...,0.0,0.0,8.787506,0.0,47,0.0,0,0,0,True


In [187]:
#It is reasonable, they are "churn"
clean_data[(clean_data['avgrecurringCharge']==0)]

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
226,34,False,6,1,773,True,A,False,True,0.0,...,0.0,1.197605,13.684288,0.0,25,0.0,0,0,0,True
452,60,True,3,3,674,False,D,False,True,0.0,...,0.0,0.0,-0.003758,0.0,34,0.0,0,0,0,True
481,34,False,0,2,323,True,F,True,False,0.0,...,0.0,2.572347,16.087536,4.67,29,0.33,0,0,0,True
870,34,False,0,2,463,True,C,True,False,5.0,...,0.0,0.0,7.816691,0.0,31,0.0,0,0,0,True


In [188]:
#for numHandsets 
#It is seem that they may have many handsets. It is reasonable, so I keep it.
clean_data.sort_values(by='numHandsets', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(7)

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
258,50,False,5,13,162,True,E,True,True,60.19,...,39.0,2.631648,-29.612219,23.67,54,0.33,1,0,0,True
158,48,False,6,10,16,True,A,False,True,83.58,...,21.33,5.674,-4.903635,17.33,54,0.0,0,0,0,False
919,28,False,1,9,58,True,B,True,True,79.49,...,30.33,1.018839,5.999242,36.0,34,0.33,0,0,0,True
367,48,True,9,8,15,True,C,False,True,59.81,...,19.0,1.02596,10.135476,4.33,25,0.0,0,0,0,False
567,58,False,3,8,237,True,E,False,True,102.61,...,20.67,2.359106,-14.326945,13.67,33,10.67,0,0,0,False
192,36,False,8,8,237,True,A,False,True,124.09,...,63.67,1.735632,2.942796,54.0,46,1.33,0,0,0,True
504,32,False,7,8,10,True,A,False,True,93.02,...,23.0,14.042685,-7.208502,14.67,56,0.0,0,0,0,False


In [189]:
#The handsetAge cannot be less than 0. So I decide to drop the rows with -2 and -1. 
#And for the value 0, it has "lifeTime=15", but they have 5 handsets, so maybe they just brought a new phone. So, it is poosible.
#And for vaule with 2, 3, they all have mutiple handsets, so they are possible.
clean_data.sort_values(by='handsetAge', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last').head(5)

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
168,32,False,1,2,-2,True,E,False,True,68.98,...,0.0,0.0,9.379005,0.0,20,0.0,1,0,0,True
692,60,True,6,4,-1,True,F,False,True,88.98,...,0.0,0.0,1.639013,0.0,40,0.0,0,0,0,False
115,20,False,1,5,0,True,E,True,False,149.98,...,14.0,0.576179,-7.789001,28.67,15,18.67,2,1,0,False
38,34,False,1,3,2,True,C,True,False,26.73,...,1.0,1.589744,-2.880754,11.0,12,4.67,0,0,1,True
669,26,False,9,2,3,True,B,False,True,83.66,...,49.67,2.0,-3.682833,7.0,16,0.0,0,0,0,False


In [190]:
clean_data['handsetAge'].replace(-2, 2, inplace=True)
clean_data['handsetAge'].replace(-1, 1, inplace=True)

In [191]:
clean_data.sort_values(by='handsetAge', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last').head(5)

Unnamed: 0,age,children,income,numHandsets,handsetAge,smartPhone,creditRating,homeOwner,creditCard,avgBill,...,avgInCalls,peakOffPeakRatio,peakOffPeakRatioChangePct,avgDroppedCalls,lifeTime,lastMonthCustomerCareCalls,numRetentionCalls,numRetentionOffersAccepted,newFrequentNumbers,churn
115,20,False,1,5,0,True,E,True,False,149.98,...,14.0,0.576179,-7.789001,28.67,15,18.67,2,1,0,False
692,60,True,6,4,1,True,F,False,True,88.98,...,0.0,0.0,1.639013,0.0,40,0.0,0,0,0,False
168,32,False,1,2,2,True,E,False,True,68.98,...,0.0,0.0,9.379005,0.0,20,0.0,1,0,0,True
38,34,False,1,3,2,True,C,True,False,26.73,...,1.0,1.589744,-2.880754,11.0,12,4.67,0,0,1,True
669,26,False,9,2,3,True,B,False,True,83.66,...,49.67,2.0,-3.682833,7.0,16,0.0,0,0,0,False


In [192]:
#Altough the avgBill is huge, the avgMins is huge, so it is reasonable.
clean_data.sort_values(by='avgBill', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(10).T

Unnamed: 0,670,407,9,495,140,468,599,91,638,532
age,36,50,22,50,52,46,26,30,34,54
children,False,True,False,False,False,False,False,False,False,False
income,1,7,4,1,8,9,7,3,0,7
numHandsets,2,4,1,3,3,1,2,2,2,2
handsetAge,303,88,337,269,83,320,127,62,89,145
smartPhone,True,True,True,True,True,True,True,True,True,True
creditRating,C,F,F,C,B,C,A,F,E,C
homeOwner,False,False,False,False,False,False,False,True,True,False
creditCard,true,true,true,true,false,true,true,false,false,true
avgBill,469.67,309.21,265.04,256.12,248.37,234.46,224.49,218.48,199.5,188.27


In [193]:
#They seem to be reasonable, for they pay more and have many incalls and outcalls, except the second value, they avgbill should be 449.6 but not 44.96
clean_data.sort_values(by='avgMins', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(10).T

Unnamed: 0,670,887,285,374,149,704,338,91,495,661
age,36,50,34,54,34,38,34,30,50,34
children,False,False,False,True,False,True,False,False,False,False
income,1,4,4,5,0,3,6,3,1,0
numHandsets,2,7,5,1,7,1,5,2,3,4
handsetAge,303,489,185,304,28,207,139,62,269,11
smartPhone,True,True,True,True,True,True,True,True,True,True
creditRating,C,A,A,B,E,D,E,F,C,E
homeOwner,False,True,False,False,True,False,True,True,False,True
creditCard,true,true,true,true,false,true,false,false,true,false
avgBill,469.67,44.96,139.58,118.57,167.59,105.77,176.04,218.48,256.12,187.12


In [194]:
# Reassign the avgBill for the person with high avgMin 3584.25.
clean_data.loc[887, 'avgBill']=449.6

In [195]:
#They avgBill and avgrecurringCharge are similar, so I think it is reasonable.
clean_data.sort_values(by='avgrecurringCharge', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(10).T

Unnamed: 0,532,149,661,966,140,403,629,762,579,879
age,54,34,34,44,52,30,38,40,34,56
children,False,False,False,False,False,False,False,False,False,True
income,7,0,0,4,8,5,5,4,8,9
numHandsets,2,7,4,2,3,5,1,3,3,3
handsetAge,145,28,11,298,83,6,338,62,223,493
smartPhone,True,True,True,True,True,True,True,True,True,True
creditRating,C,E,E,A,B,D,B,E,C,A
homeOwner,False,True,True,False,False,False,False,True,False,False
creditCard,true,false,false,true,false,true,true,false,true,true
avgBill,188.27,167.59,187.12,165.18,248.37,106.47,107.24,184.47,115.25,117.16


In [196]:
#It is possible. And they pay more, so it is reasonable.
clean_data.sort_values(by='avgOverBundleMins', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(10).T

Unnamed: 0,670,887,9,407,638,495,91,197,790,600
age,36,50,22,50,34,50,30,54,34,28
children,False,False,False,True,False,False,False,False,False,False
income,1,4,4,7,0,1,3,7,0,6
numHandsets,2,7,1,4,2,3,2,2,1,3
handsetAge,303,489,337,88,89,269,62,402,300,90
smartPhone,True,True,True,True,True,True,True,True,True,True
creditRating,C,A,F,F,E,C,F,A,G,B
homeOwner,False,True,False,False,True,False,True,False,True,False
creditCard,true,true,true,true,false,true,false,true,false,true
avgBill,469.67,449.6,265.04,309.21,199.5,256.12,218.48,181.82,172.69,187.64


In [197]:
#It is reasonable, they can make more avgRoamCalls. And they pay more for that.
clean_data.sort_values(by='avgRoamCalls', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(10).T

Unnamed: 0,440,836,596,140,987,203,805,724,202,480
age,34,46,52,52,34,54,40,36,34,46
children,False,True,True,False,False,True,False,False,False,False
income,5,4,8,8,0,6,5,3,0,4
numHandsets,1,1,1,3,1,2,1,4,2,2
handsetAge,550,319,368,83,231,10,255,378,20,417
smartPhone,True,True,True,True,True,True,True,True,True,True
creditRating,C,G,B,B,B,A,B,A,C,B
homeOwner,True,False,False,False,True,False,False,False,True,False
creditCard,false,false,true,false,false,true,true,true,false,true
avgBill,116.91,187.84,92.42,248.37,85.07,59.97,65.64,149.39,109.64,81.64


In [198]:
#The first item seems to be not reasonable, for the number of customer care calls are too big. 
#The number of customer care calls of fourth item is greater than they avgoutcall, 
#   but they do have huge aveMins, and didn;t pay much. So, I think it is reasonable
clean_data.sort_values(by='lastMonthCustomerCareCalls', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(5).T


Unnamed: 0,363,814,570,704,213
age,40,30,52,38,34
children,False,False,False,True,False
income,4,5,6,3,0
numHandsets,3,5,4,1,4
handsetAge,35,118,480,207,70
smartPhone,True,True,True,True,True
creditRating,E,A,A,D,E
homeOwner,False,False,False,False,True
creditCard,true,false,true,true,false
avgBill,35.1,50.58,75.48,105.77,66.7


In [199]:
#change the number of customer care calls.
clean_data.loc[363, 'lastMonthCustomerCareCalls']=36.567

In [200]:
# It is possible. And they pay more, so it is reasonable
clean_data.sort_values(by='avgOutCalls', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(3).T

Unnamed: 0,363,407,600
age,40,50,28
children,False,True,False
income,4,7,6
numHandsets,3,4,3
handsetAge,35,88,90
smartPhone,True,True,True
creditRating,E,F,B
homeOwner,False,False,False
creditCard,true,true,true
avgBill,35.1,309.21,187.64


In [201]:
# It is possible. And they pay more, so it is reasonable
clean_data.sort_values(by='avgInCalls', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last').head(3).T

Unnamed: 0,149,806,407
age,34,28,50
children,False,False,True
income,0,4,7
numHandsets,7,1,4
handsetAge,28,200,88
smartPhone,True,True,True
creditRating,E,E,F
homeOwner,True,False,False
creditCard,false,true,true
avgBill,167.59,182.85,309.21


In [202]:
clean_data.to_csv('ready2.csv', index=False)

In [203]:
ready_data = pd.read_csv('ready2.csv')

In [204]:
print("done1")

done1
