In [55]:
# Standard library
import os
import warnings

# Data handling
import numpy as np
import pandas as pd

#NLTK-------------------------------
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

# Text feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import libraries for feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Classifiers
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Model selection and evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Tokenizer models needed by word_tokenize
nltk.download('punkt')

# Silences every warning for the rest of the session (deprecation notices included)
warnings.filterwarnings("ignore")



[nltk_data] Downloading package punkt to /Users/yashjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
#Read files
# The directory holding the two CSVs can be overridden through the DATA_DIR
# environment variable; it falls back to the original location so existing
# runs behave exactly as before.
data_dir = os.environ.get('DATA_DIR', r'/Users/yashjain/Downloads')

# Free-text comments
textfile = os.path.join(data_dir, 'Comments.csv')
textData = pd.read_csv(textfile) #creates a dataframe

# Structured customer attributes (also holds the TARGET and ID columns used later)
CustInfofile = os.path.join(data_dir, 'Customers.csv')
CustInfoData = pd.read_csv(CustInfofile)  #creates a dataframe

# Quick sanity check on the size of both tables
print(textData.shape)
print(CustInfoData.shape)


(2070, 2)
(2070, 17)


In [32]:
#Tokenize - Split the sentences to lists of words
# Every entry of the 'Comments' column is passed through NLTK's word_tokenize
# and the resulting token list is stored in a new column beside the raw text.
raw_comments = textData['Comments']
textData['CommentsTokenized'] = raw_comments.map(word_tokenize)

# Show the tokenized column
textData['CommentsTokenized']




0       [Does, not, like, the, way, the, phone, works,...
1       [Wanted, to, know, the, nearest, store, locati...
2       [Wants, to, know, how, to, do, text, messaging...
3       [Asked, how, to, disable, call, waiting, ., re...
4       [Needs, help, learning, how, to, use, the, pho...
                              ...                        
2065    [Needed, help, figuring, out, his, bill, ., I,...
2066    [He, lost, his, phone, and, called, to, cancel...
2067    [Lost, the, directions, to, phone, and, wants,...
2068                      [Wants, to, change, address, .]
2069    [He, lost, his, phone, and, called, to, cancel...
Name: CommentsTokenized, Length: 2070, dtype: object

In [33]:
# Use snowball stemmer.
stemmer = SnowballStemmer("english")

#Now do stemming - store the stemmed tokens in a new column of textData
# Each row is a list of tokens; every token in the list is stemmed individually.
tokenized_comments = textData['CommentsTokenized']
textData['CommentsTokenizedstemmed'] = tokenized_comments.apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

# Show the Snowball-stemmed column
textData['CommentsTokenizedstemmed']


0       [doe, not, like, the, way, the, phone, work, ....
1       [want, to, know, the, nearest, store, locat, ....
2       [want, to, know, how, to, do, text, messag, .,...
3       [ask, how, to, disabl, call, wait, ., refer, h...
4       [need, help, learn, how, to, use, the, phone, ...
                              ...                        
2065    [need, help, figur, out, his, bill, ., i, expl...
2066    [he, lost, his, phone, and, call, to, cancel, ...
2067    [lost, the, direct, to, phone, and, want, anot...
2068                        [want, to, chang, address, .]
2069    [he, lost, his, phone, and, call, to, cancel, ...
Name: CommentsTokenizedstemmed, Length: 2070, dtype: object

In [34]:
# Use Porter stemmer.
# Note: this rebinds the name `stemmer`, replacing the Snowball instance created earlier.
stemmer = PorterStemmer()

# Stem every token of every row and keep the result in its own column.
tokenized_comments = textData['CommentsTokenized']
textData['porterstemmed'] = tokenized_comments.apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

# Show the Porter-stemmed column
textData['porterstemmed']

0       [doe, not, like, the, way, the, phone, work, ....
1       [want, to, know, the, nearest, store, locat, ....
2       [want, to, know, how, to, do, text, messag, .,...
3       [ask, how, to, disabl, call, wait, ., refer, h...
4       [need, help, learn, how, to, use, the, phone, ...
                              ...                        
2065    [need, help, figur, out, hi, bill, ., i, expla...
2066    [he, lost, hi, phone, and, call, to, cancel, s...
2067    [lost, the, direct, to, phone, and, want, anot...
2068                        [want, to, chang, address, .]
2069    [he, lost, hi, phone, and, call, to, cancel, s...
Name: porterstemmed, Length: 2070, dtype: object

In [36]:

#Join stemmed strings
# Glue each row's Snowball-stemmed tokens back into one space-separated string,
# which is the input format the vectorizer expects.
stemmed_tokens = textData['CommentsTokenizedstemmed']
textData['joinedstemmed'] = stemmed_tokens.map(" ".join)

# Show the re-joined text
textData['joinedstemmed']

0       doe not like the way the phone work . it is to...
1       want to know the nearest store locat . want to...
2       want to know how to do text messag . refer him...
3       ask how to disabl call wait . refer him to web...
4       need help learn how to use the phone . i sugge...
                              ...                        
2065    need help figur out his bill . i explain our m...
2066    he lost his phone and call to cancel servic . ...
2067    lost the direct to phone and want anoth manual...
2068                              want to chang address .
2069    he lost his phone and call to cancel servic . ...
Name: joinedstemmed, Length: 2070, dtype: object

In [37]:
#Do Bag-Of-Words model - Term - Document Matrix
#Learn the vocabulary dictionary and return term-document matrix.

# NOTE(review): the English stop-word list is applied to text that has already
# been stemmed, so stemmed forms of stop words can slip into the vocabulary -
# confirm whether stop words should be removed before stemming instead.
count_vect = CountVectorizer(stop_words='english',lowercase=False)
TD_counts = count_vect.fit_transform(textData.joinedstemmed)
print(TD_counts.shape)
print(TD_counts.dtype)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back to the legacy
# method only on older releases; either way a plain list of terms is printed.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab_terms = count_vect.get_feature_names_out().tolist()
else:
    vocab_terms = count_vect.get_feature_names()
print(vocab_terms)

# Dense copy of the sparse count matrix, for inspection only
DF_TD_Counts=pd.DataFrame(TD_counts.toarray())
print(DF_TD_Counts)



(2070, 354)
int64
['3399', '3g', 'abysm', 'access', 'accessori', 'adapt', 'add', 'addit', 'additon', 'address', 'adit', 'adress', 'advertis', 'afraid', 'alway', 'angel', 'angri', 'ani', 'anoth', 'anyth', 'anytim', 'area', 'asap', 'ask', 'bad', 'basic', 'bateri', 'batteri', 'becaus', 'believ', 'better', 'bigger', 'book', 'bought', 'brain', 'bring', 'built', 'busi', 'button', 'buy', 'cancel', 'cancer', 'car', 'care', 'carrier', 'caus', 'cc', 'cell', 'certain', 'chang', 'charg', 'charger', 'check', 'chip', 'citi', 'claim', 'cleariti', 'cold', 'comapr', 'compani', 'compar', 'competit', 'complain', 'complaint', 'concept', 'connect', 'consisit', 'consist', 'constan', 'contact', 'continu', 'contract', 'correct', 'cost', 'coupl', 'cover', 'coverag', 'creat', 'credit', 'cstmer', 'cstmr', 'current', 'cust', 'custom', 'customr', 'date', 'day', 'dead', 'decent', 'defect', 'deo', 'did', 'die', 'differ', 'difficult', 'digiti', 'direct', 'disabl', 'doe', 'don', 'dont', 'drop', 'dure', 'easier', 'effe

In [39]:
#Compute TF-IDF Matrix
# Learn the IDF weights from the raw term counts, then re-weight those same counts.
# Despite the name, X_train_tfidf is computed over every row of TD_counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(TD_counts)
X_train_tfidf = tfidf_transformer.transform(TD_counts)
print(X_train_tfidf.shape)

# Dense copy of the TF-IDF matrix; columns are labelled by position
tfidf_dense = X_train_tfidf.toarray()
DF_TF_IDF = pd.DataFrame(tfidf_dense)
print(DF_TF_IDF)


(2070, 354)
      0    1    2    3        4    5    6    7    8         9    ...  344  \
0     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
1     0.0  0.0  0.0  0.0  0.27568  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
3     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
4     0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
...   ...  ...  ...  ...      ...  ...  ...  ...  ...       ...  ...  ...   
2065  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2066  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2067  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   
2068  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.772949  ...  0.0   
2069  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0  0.0  0.000000  ...  0.0   

      345  346       347  348  349  350  351  352  353  
0     

In [41]:
#Customer Info One-Hot Encoded
#combining with customer data - one hot encoding

# Customer attributes and TF-IDF features are paired row by row on the index.
# NOTE(review): this assumes Comments.csv and Customers.csv list the customers
# in the same order - confirm.
cust_tfidf = pd.merge(CustInfoData ,DF_TF_IDF , left_index=True, right_index=True )

# The label and the identifier must not be used as predictors
cust_tfidf = cust_tfidf.drop(columns = ['TARGET' , 'ID'])

# Prediction target
y = CustInfoData['TARGET']

categorical_var = ["Sex","Status","Car_Owner","Paymethod","LocalBilltype","LongDistanceBilltype"]

# dtype is pinned to uint8 so the indicator columns are 0/1 integers on every
# pandas version (pandas >= 2.0 would otherwise produce booleans).
cust_tfidf_encoded = pd.get_dummies(cust_tfidf , columns = categorical_var , dtype = np.uint8)

# The TF-IDF columns carry integer labels while the customer columns carry
# string labels. scikit-learn >= 1.2 rejects DataFrames whose column labels mix
# types, so every label is converted to str. Later cells select columns by
# position, so they are unaffected.
# NOTE(review): assumes no customer column is already named like a number.
cust_tfidf_encoded.columns = cust_tfidf_encoded.columns.astype(str)
cust_tfidf_encoded

Unnamed: 0,Children,Est_Income,Usage,Age,RatePlan,LongDistance,International,Local,Dropped,0,...,Status_S,Car_Owner_N,Car_Owner_Y,Paymethod_Auto,Paymethod_CC,Paymethod_CH,LocalBilltype_Budget,LocalBilltype_FreeLocal,LongDistanceBilltype_Intnl_discount,LongDistanceBilltype_Standard
0,1,38000.00,229.64,24.393333,3,23.56,0.00,206.08,0,0.0,...,1,1,0,0,1,0,1,0,1,0
1,2,29616.00,75.29,49.426667,2,29.78,0.00,45.50,0,0.0,...,0,1,0,0,0,1,0,1,0,1
2,0,19732.80,47.25,50.673333,3,24.81,0.00,22.44,0,0.0,...,0,1,0,0,1,0,0,1,0,1
3,2,96.33,59.01,56.473333,1,26.13,0.00,32.88,1,0.0,...,1,1,0,0,1,0,1,0,0,1
4,2,52004.80,28.14,25.140000,1,5.03,0.00,23.11,0,0.0,...,0,1,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2065,0,78851.30,29.04,48.373333,4,0.37,0.00,28.66,0,0.0,...,1,1,0,0,1,0,0,1,0,1
2066,1,17540.70,36.20,62.786667,1,22.17,0.57,13.45,0,0.0,...,1,0,1,1,0,0,1,0,0,1
2067,0,83891.90,74.40,61.020000,4,28.92,0.00,45.47,0,0.0,...,0,0,1,0,0,1,1,0,0,1
2068,2,28220.80,38.95,38.766667,4,26.49,0.00,12.46,0,0.0,...,0,1,0,0,1,0,0,1,0,1


In [61]:
#Feature selection - SelectKBest (FILTER)
# Score every column against the target with the chi-squared statistic and keep the 50 best.
# NOTE(review): the selector is fitted on all rows, including those that the
# later train/test split holds out - confirm this is acceptable.
select = SelectKBest(score_func=chi2, k=50)
select.fit(cust_tfidf_encoded, y)

# Positional indices of the retained columns
feature_50 = select.get_support(indices=True)
print(feature_50)

# Reduced feature matrix (columns are renumbered from 0)
kbest = pd.DataFrame(select.transform(cust_tfidf_encoded))
print(kbest)

[  0   1   2   3   5   6   7   8  18  20  23  35  58  59  60  71  79  90
 124 127 130 153 166 192 195 199 203 221 226 231 248 253 257 258 268 275
 282 322 324 328 332 347 363 364 366 367 370 371 375 376]
       0         1       2          3      4     5       6    7         8   \
0     1.0  38000.00  229.64  24.393333  23.56  0.00  206.08  0.0  0.000000   
1     2.0  29616.00   75.29  49.426667  29.78  0.00   45.50  0.0  0.000000   
2     0.0  19732.80   47.25  50.673333  24.81  0.00   22.44  0.0  0.000000   
3     2.0     96.33   59.01  56.473333  26.13  0.00   32.88  1.0  0.000000   
4     2.0  52004.80   28.14  25.140000   5.03  0.00   23.11  0.0  0.000000   
...   ...       ...     ...        ...    ...   ...     ...  ...       ...   
2065  0.0  78851.30   29.04  48.373333   0.37  0.00   28.66  0.0  0.000000   
2066  1.0  17540.70   36.20  62.786667  22.17  0.57   13.45  0.0  0.000000   
2067  0.0  83891.90   74.40  61.020000  28.92  0.00   45.47  0.0  0.000000   
2068  2.0  28220

In [62]:
#Feature selection - SelectKBest (FILTER)
# Score every column against the target with the chi-squared statistic and keep the 10 best.
# NOTE(review): the selector is fitted on all rows, including those that the
# later train/test split holds out - confirm this is acceptable.
select = SelectKBest(score_func=chi2, k=10)
select.fit(cust_tfidf_encoded, y)

# Positional indices of the retained columns
feature_10 = select.get_support(indices=True)
print(feature_10)

# Reduced feature matrix (columns are renumbered from 0)
kbest = pd.DataFrame(select.transform(cust_tfidf_encoded))
print(kbest)

[  0   1   2   5   6   7 364 366 367 370]
        0         1       2      3     4       5    6    7    8    9
0     1.0  38000.00  229.64  23.56  0.00  206.08  0.0  0.0  1.0  0.0
1     2.0  29616.00   75.29  29.78  0.00   45.50  1.0  1.0  0.0  0.0
2     0.0  19732.80   47.25  24.81  0.00   22.44  1.0  1.0  0.0  0.0
3     2.0     96.33   59.01  26.13  0.00   32.88  1.0  0.0  1.0  0.0
4     2.0  52004.80   28.14   5.03  0.00   23.11  0.0  1.0  0.0  0.0
...   ...       ...     ...    ...   ...     ...  ...  ...  ...  ...
2065  0.0  78851.30   29.04   0.37  0.00   28.66  0.0  0.0  1.0  0.0
2066  1.0  17540.70   36.20  22.17  0.57   13.45  0.0  0.0  1.0  1.0
2067  0.0  83891.90   74.40  28.92  0.00   45.47  0.0  1.0  0.0  0.0
2068  2.0  28220.80   38.95  26.49  0.00   12.46  0.0  1.0  0.0  0.0
2069  0.0  28589.10  100.28  13.19  0.00   87.09  0.0  0.0  1.0  0.0

[2070 rows x 10 columns]


In [63]:
#Feature selection - SelectKBest (FILTER)
# Score every column against the target with the chi-squared statistic and keep the 15 best.
# NOTE(review): the selector is fitted on all rows, including those that the
# later train/test split holds out - confirm this is acceptable.
select = SelectKBest(score_func=chi2, k=15)
select.fit(cust_tfidf_encoded, y)

# Positional indices of the retained columns
feature_15 = select.get_support(indices=True)
print(feature_15)

# Reduced feature matrix (columns are renumbered from 0)
kbest = pd.DataFrame(select.transform(cust_tfidf_encoded))
print(kbest)

[  0   1   2   5   6   7   8 248 363 364 366 367 370 371 375]
       0         1       2      3     4       5    6    7    8    9    10  \
0     1.0  38000.00  229.64  23.56  0.00  206.08  0.0  0.0  1.0  0.0  0.0   
1     2.0  29616.00   75.29  29.78  0.00   45.50  0.0  0.0  0.0  1.0  1.0   
2     0.0  19732.80   47.25  24.81  0.00   22.44  0.0  0.0  0.0  1.0  1.0   
3     2.0     96.33   59.01  26.13  0.00   32.88  1.0  0.0  0.0  1.0  0.0   
4     2.0  52004.80   28.14   5.03  0.00   23.11  0.0  0.0  1.0  0.0  1.0   
...   ...       ...     ...    ...   ...     ...  ...  ...  ...  ...  ...   
2065  0.0  78851.30   29.04   0.37  0.00   28.66  0.0  0.0  1.0  0.0  0.0   
2066  1.0  17540.70   36.20  22.17  0.57   13.45  0.0  0.0  1.0  0.0  0.0   
2067  0.0  83891.90   74.40  28.92  0.00   45.47  0.0  0.0  1.0  0.0  1.0   
2068  2.0  28220.80   38.95  26.49  0.00   12.46  0.0  0.0  1.0  0.0  1.0   
2069  0.0  28589.10  100.28  13.19  0.00   87.09  0.0  0.0  1.0  0.0  0.0   

       11   1

In [64]:
#Do feature selection using a classification model---Wrapper

# A fixed seed makes the forest, and therefore the selected columns,
# reproducible from run to run (42 matches the seed used for the train/test split).
clf = RandomForestClassifier(random_state=42)

# threshold=-inf disables the importance cut-off, so max_features alone decides
# how many columns are kept: the 7 with the highest importance.
model = SelectFromModel(clf, max_features=7, threshold=-np.inf).fit(cust_tfidf_encoded,y)

# Positional indices of the retained columns
feature_rfc = model.get_support(indices=True)
print(feature_rfc)

# Reduced feature matrix (columns are renumbered from 0)
rfc = pd.DataFrame(model.transform(cust_tfidf_encoded))

rfc

[  0   1   2   3   5   7 367]


Unnamed: 0,0,1,2,3,4,5,6
0,1.0,38000.00,229.64,24.393333,23.56,206.08,1.0
1,2.0,29616.00,75.29,49.426667,29.78,45.50,0.0
2,0.0,19732.80,47.25,50.673333,24.81,22.44,0.0
3,2.0,96.33,59.01,56.473333,26.13,32.88,1.0
4,2.0,52004.80,28.14,25.140000,5.03,23.11,0.0
...,...,...,...,...,...,...,...
2065,0.0,78851.30,29.04,48.373333,0.37,28.66,1.0
2066,1.0,17540.70,36.20,62.786667,22.17,13.45,1.0
2067,0.0,83891.90,74.40,61.020000,28.92,45.47,0.0
2068,2.0,28220.80,38.95,38.766667,26.49,12.46,0.0


In [65]:
#Do feature selection using a classification model---Wrapper

# A fixed seed makes the boosted model, and therefore the selected columns,
# reproducible from run to run (42 matches the seed used for the train/test split).
clf = GradientBoostingClassifier(random_state=42)

# threshold=-inf disables the importance cut-off, so max_features alone decides
# how many columns are kept: the 7 with the highest importance.
model = SelectFromModel(clf, max_features=7, threshold=-np.inf).fit(cust_tfidf_encoded,y)

# Positional indices of the retained columns
feature_gbc = model.get_support(indices=True)
print(feature_gbc)

# Reduced feature matrix (columns are renumbered from 0)
X_new = model.transform(cust_tfidf_encoded)
X_new_SelectedFeatures= pd.DataFrame(X_new)

print(X_new_SelectedFeatures)



[  0   1   3   5   6 367 370]
        0         1          2      3     4    5    6
0     1.0  38000.00  24.393333  23.56  0.00  1.0  0.0
1     2.0  29616.00  49.426667  29.78  0.00  0.0  0.0
2     0.0  19732.80  50.673333  24.81  0.00  0.0  0.0
3     2.0     96.33  56.473333  26.13  0.00  1.0  0.0
4     2.0  52004.80  25.140000   5.03  0.00  0.0  0.0
...   ...       ...        ...    ...   ...  ...  ...
2065  0.0  78851.30  48.373333   0.37  0.00  1.0  0.0
2066  1.0  17540.70  62.786667  22.17  0.57  1.0  1.0
2067  0.0  83891.90  61.020000  28.92  0.00  0.0  0.0
2068  2.0  28220.80  38.766667  26.49  0.00  0.0  0.0
2069  0.0  28589.10  15.600000  13.19  0.00  1.0  0.0

[2070 rows x 7 columns]


In [66]:
# Train test split
# 80% of the rows go to training and 20% are held out; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cust_tfidf_encoded,
    y,
    test_size=0.20,
    random_state=42,
)

In [67]:
#filter classification
# Compare the three chi-squared feature subsets by cross-validated accuracy
# on the training rows, using the same classifier for each.
model = GradientBoostingClassifier(n_estimators = 50)

# Training columns for each subset, picked by position
X_train_50 = X_train.iloc[:, feature_50]
X_train_10 = X_train.iloc[:, feature_10]
X_train_15 = X_train.iloc[:, feature_15]

accuracy_50 = cross_val_score(model, X_train_50, y_train, scoring='accuracy')
accuracy_10 = cross_val_score(model, X_train_10, y_train, scoring='accuracy')
accuracy_15 = cross_val_score(model, X_train_15, y_train, scoring='accuracy')

# Mean accuracy across folds, printed in the order 50, 10, 15
for fold_scores in (accuracy_50, accuracy_10, accuracy_15):
    print(np.mean(fold_scores))

0.8496505660102646
0.8430040403305064
0.8405980417136825


In [68]:
# Fit on the training rows restricted to the 15 chi-squared features, then
# score on the held-out rows.
# NOTE(review): the cross-validation output above ranks feature_50 highest,
# yet feature_15 is used here - confirm this is intentional.
X_train_15 = X_train.iloc[:, feature_15]
X_test_15 = X_test.iloc[:, feature_15]

model = model.fit(X_train_15, y_train)
pred = model.predict(X_test_15)
accuracy_score(y_test, pred)

0.8236714975845411

In [60]:
# wrapper classification
# Compare the two model-based feature subsets by cross-validated accuracy
# on the training rows, using the same classifier for each.
# NOTE(review): this cell's execution count (60) is lower than the counts of
# the cells that define feature_rfc (64) and feature_gbc (65), so the printed
# scores may come from an earlier selection - re-run in order to confirm.
model = GradientBoostingClassifier(n_estimators = 50)

# Training columns for each subset, picked by position
X_train_rfc = X_train.iloc[:, feature_rfc]
X_train_gbc = X_train.iloc[:, feature_gbc]

accuracy_rfc = cross_val_score(model, X_train_rfc, y_train, scoring='accuracy')
accuracy_gbc = cross_val_score(model, X_train_gbc, y_train, scoring='accuracy')

# Mean accuracy across folds, printed in the order rfc, gbc
for fold_scores in (accuracy_rfc, accuracy_gbc):
    print(np.mean(fold_scores))

0.815826447785098
0.8315254977614386


In [69]:
# Fit on the training rows restricted to the random-forest-selected features,
# then score on the held-out rows.
# NOTE(review): the cross-validation output above ranks feature_gbc higher
# than feature_rfc, yet feature_rfc is used here - confirm this is intentional.
X_train_rfc = X_train.iloc[:, feature_rfc]
X_test_rfc = X_test.iloc[:, feature_rfc]

model = model.fit(X_train_rfc, y_train)
pred = model.predict(X_test_rfc)
accuracy_score(y_test, pred)

0.8091787439613527