In [40]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [41]:
# Display Settings
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 190)

In [42]:
#Set working directory
os.chdir('/Users/valentinas/Desktop/Py/Rank Sales Leads')

In [59]:
#Data
train = pd.read_csv("train.csv", dtype={'PhoneNumber':str})
test = pd.read_csv("test.csv", dtype={'PhoneNumber':str})
display(train.head(n=10))
display(test.head(n=1))

Unnamed: 0,LeadID,CompanyName,TypeOfBusiness,FacebookLikes,TwitterFollowers,Website,PhoneNumber,Contact,Sale
0,8,Lots O' Stuff,corner store,,,lotsostuff.net,3106358130,owner,False
1,9,Rampart Supplies,corner store,7.0,,,5045973037,general line,False
2,4,Joe's Hobby Shop,hobbies & toys,,,,5043304798,other,False
3,28,Smokes and Beer,corner store,,,,5046995720,other,False
4,10,Missy's,cothing,,,missysclothesstore.net,5045705441,other,False
5,17,Sunshine Boutique,cothing,2.0,11.0,,5046647993,manager,False
6,21,Taylor Anne,cothing,163.0,14.0,taylorannestore.com,6464316190,general line,True
7,24,The Reading Corner,,17.0,,,6467041649,other,False
8,23,Hungry Hippo,restaurant,51.0,28.0,hhcafe.com,5049414938,owner,True
9,16,Frank's Shoes,cothing,17.0,3.0,franksshoes.com,6465266938,owner,True


Unnamed: 0,LeadID,CompanyName,TypeOfBusiness,FacebookLikes,TwitterFollowers,Website,PhoneNumber,Contact,Sale
0,2,The Law Offices of Smith,law office,87.0,46.0,smithlaw.net,3109859670,other,False


In [44]:
train.Sale.sum()/train.shape[0]

0

In [45]:
# TypeOfBusiness

# Convert NaN to "NA_Val"
train.TypeOfBusiness.fillna('NA_Val', inplace=True)
test.TypeOfBusiness.fillna('NA_Val', inplace=True)

# To help avoid overfitting, and to reduce the number of columns generated from one-hot-encoding,
# we will mark uncommon business types as "other" (freq <= 1)
tobMap = train.groupby('TypeOfBusiness')['TypeOfBusiness'].agg({'count'}).reset_index()
tobMap['TOBGroup'] = tobMap.TypeOfBusiness
tobMap.loc[tobMap['count'] <= 1, 'TOBGroup'] = 'other'
train = train.merge(tobMap.drop('count', axis=1), on='TypeOfBusiness', how='left')
test = test.merge(tobMap.drop('count', axis=1), on='TypeOfBusiness', how='left')

# one-hot-encode
tob_groups = tobMap.TOBGroup.unique()
train['TOBGroup'] = pd.Categorical(train.TOBGroup, categories=tob_groups)
train_tob_dummies = pd.get_dummies(train.TOBGroup, prefix='TOB')
train = pd.concat([train, train_tob_dummies], axis=1)
test['TOBGroup'] = pd.Categorical(test.TOBGroup, categories=tob_groups)
test_tob_dummies = pd.get_dummies(test.TOBGroup, prefix='TOB')
test = pd.concat([test, test_tob_dummies], axis=1)


In [46]:
print test.TypeOfBusiness,train.TypeOfBusiness

0        law office
1            NA_Val
2        restaurant
3    hobbies & toys
4        restaurant
5           grocery
6            NA_Val
7            NA_Val
8             books
9        law office
Name: TypeOfBusiness, dtype: object 0       corner store
1       corner store
2     hobbies & toys
3       corner store
4            cothing
           ...      
15        auto parts
16        restaurant
17           grocery
18        auto parts
19        law office
Name: TypeOfBusiness, Length: 20, dtype: object


In [47]:
# Website Extension

extensions = ['none', 'com', 'org', 'net', 'other']

# train
train.loc[train.Website.isnull(), 'WebsiteExtension'] = 'none'
train.loc[train.Website.str.contains('com').replace(np.nan, False), 'WebsiteExtension'] = 'com'
train.loc[train.Website.str.contains('org').replace(np.nan, False), 'WebsiteExtension'] = 'org'
train.loc[train.Website.str.contains('net').replace(np.nan, False), 'WebsiteExtension'] = 'net'
train.loc[train.WebsiteExtension.isnull(), 'WebsiteExtension'] = 'other'
train['WebsiteExtension'] = pd.Categorical(train.WebsiteExtension, categories=extensions)
train_extension_dummies = pd.get_dummies(train.WebsiteExtension, prefix='EX')
train = pd.concat([train, train_extension_dummies], axis=1)

# test
test.loc[test.Website.isnull(), 'WebsiteExtension'] = 'none'
test.loc[test.Website.str.contains('com').replace(np.nan, False), 'WebsiteExtension'] = 'com'
test.loc[test.Website.str.contains('org').replace(np.nan, False), 'WebsiteExtension'] = 'org'
test.loc[test.Website.str.contains('net').replace(np.nan, False), 'WebsiteExtension'] = 'net'
test.loc[test.WebsiteExtension.isnull(), 'WebsiteExtension'] = 'other'
test['WebsiteExtension'] = pd.Categorical(test.WebsiteExtension, categories=extensions)
test_extension_dummies = pd.get_dummies(test.WebsiteExtension, prefix='EX')
test = pd.concat([test, test_extension_dummies], axis=1)

In [48]:
# AreaCode

# train
train['AreaCode'] = train.PhoneNumber.str[:3]
train['AreaCode'] = pd.Categorical(train.AreaCode)
train_areacode_dummies = pd.get_dummies(train.AreaCode, prefix='AC')
train = pd.concat([train, train_areacode_dummies], axis=1)

# test
test['AreaCode'] = test.PhoneNumber.str[:3]
test['AreaCode'] = pd.Categorical(test.AreaCode, categories=train.AreaCode.cat.categories)
test_areacode_dummies = pd.get_dummies(test.AreaCode, prefix='AC')
test = pd.concat([test, test_areacode_dummies], axis=1)

#--------------------------------------------------
# Contact (convert to numeric, 1-4)

# In this case, we know all the possible contact types
contacts = ["general line", "other", "manager", "owner"]  # Note the order of the elements
train['Contact'] = pd.Categorical(train.Contact, categories=contacts, ordered=True).codes
test['Contact'] = pd.Categorical(test.Contact, categories=contacts, ordered=True).codes

#--------------------------------------------------
# FacebookLikes

train.FacebookLikes.fillna(-1, inplace=True)
test.FacebookLikes.fillna(-1, inplace=True)

#--------------------------------------------------
# TwitterFollowers

train.TwitterFollowers.fillna(-1, inplace=True)
test.TwitterFollowers.fillna(-1, inplace=True)


In [49]:
# Random Forest Model

features = ['Contact', 'FacebookLikes', 'TwitterFollowers'] + train_tob_dummies.columns.tolist() + train_areacode_dummies.columns.tolist() + test_areacode_dummies.columns.tolist()
rf = RandomForestClassifier(n_estimators=200, max_features=.33, min_samples_leaf=3, random_state=2016)
rf.fit(X=train[features].values, y=train.Sale.values)
#--------------------------------------------------
# Check the importance of features

importances = pd.DataFrame({'Feature':features, 'Importance':rf.feature_importances_})
importances.sort_values('Importance', ascending=False)

#======================================================================================================
# Make some predictions on the test set & evaluate the results

test['ProbSale'] = rf.predict_proba(test[features].values)[:,1]


In [50]:
print test['ProbSale']

0    0.361556
1    0.566923
2    0.421090
3    0.088121
4    0.297857
5    0.534341
6    0.636237
7    0.228545
8    0.282940
9    0.646413
Name: ProbSale, dtype: float64


In [51]:
# Rank the predictions from most likely to least likely

test.sort_values('ProbSale', inplace=True, ascending=False)
test['ProbSaleRk'] = np.arange(test.shape[0])

#--------------------------------------------------
# Take a look

test[['ProbSaleRk', 'CompanyName', 'ProbSale', 'Sale']]  

#--------------------------------------------------
# Evaluate the results using area under the ROC curve

roc_auc_score(y_true=test.Sale, y_score=test.ProbSale)  # 0.75

0.75