### Name： Zhong He， Cai Juan Luo
### Date: September 21st 2017
### Project: Fund Raising Prediction - Direct Marketing to PVA Donors

# Step 1: Data Preparation

## 1.1 load dataset

In [1]:
import pandas as pd
import numpy as np
import sklearn

learndata = pd.read_csv('cup98LRN.txt', header = 0, dtype={"NOEXCH": np.str})
#valdata = pd.read_csv('cup98VAL.txt', header = 0)
#valdata.head(5)
print('TARGET_B' in learndata )

True


## 1.2 target distribution 

In [4]:
print('Number of obs in learning data:',len(learndata))

respond= pd.value_counts(learndata['TARGET_B'])
print(respond)
print(4843+90569-len(learndata) )
print('no missing values for TARGET_B')
print('only', 4843/95412*100, '% donors respond')

Number of obs in learning data: 95412
0    90569
1     4843
Name: TARGET_B, dtype: int64
0
no missing values for TARGET_B
only 5.075881440489666 % donors respond


## 1.3 encode data into numerics

In [5]:
from sklearn import preprocessing

#First, find all string columns
string=learndata.select_dtypes(['object'])
print('number of string column:',len(string.columns))

#remove all space in string data
learndata[string.columns] = string.apply(lambda x: x.str.strip())

# label encoder
for x in string.columns:
    le = preprocessing.LabelEncoder()
    learndata[x][learndata[x].isnull()] = 'NaN' # replace missing value into NaN
    le.fit(learndata[x])
    learndata[x]=le.transform(learndata[x]) 
    
#check again
string=learndata.select_dtypes(['object'])
print('number of string column:',len(string.columns))

# Convert values back to their category names
#list(le.inverse_transform(x))




number of string column: 74


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


number of string column: 0


## 1.4 feature selection

In [6]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X=learndata.drop('TARGET_B', axis=1)
Y=learndata.TARGET_B

X = X.fillna(-1,axis=1)

clf = ExtraTreesClassifier(n_estimators=1000,random_state =1234)
clf = clf.fit(X, Y)
clf.feature_importances_  
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
print(X_new.shape)


# get the column names back for X_new
count=0
feature= list()
for col in model.get_support():
    if col == True:
        feature.append(count) 
    count = count +1
print(len(feature))

name=learndata.columns[feature]

X_new= pd.DataFrame(X_new, columns=name)
#X_new

(95412, 17)
17


## 1.5 oversampling to 70/30

In [5]:
from sklearn.utils import resample


x1=X_new[X_new.TARGET_B==1]
y0=X_new[X_new.TARGET_B==0]

oversampled = resample(x1,replace=True,n_samples=38826,random_state=123) 
df = pd.concat([oversampled, y0])
new_rate=pd.value_counts(df['TARGET_B'])
print(new_rate)
print('now', 38826/len(df)*100, '% donors respond in our new oversampled dataset')
print('Number of obs in new dataset:',len(df))

0.0    90569
1.0    38826
Name: TARGET_B, dtype: int64
now 30.005796205417518 % donors respond in our new oversampled dataset
Number of obs in new dataset: 129395


## 1.6 data partition

In [6]:
# df_train 70%
# df_val   30%
from sklearn.model_selection import train_test_split

X, y = df.drop('TARGET_B', axis=1), df.TARGET_B
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)
print(len(y_train[y_train==1])/len(y_train))


0.30006845080374495


# Step 2: Model

## 2.1 Logistic Regression



In [7]:
import statsmodels.api as sm

logit = sm.Logit(y_train, X_train)

# fit the model
result = logit.fit()
print(result.summary())

  from pandas.core import datetools


Optimization terminated successfully.
         Current function value: 0.276392
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:               TARGET_B   No. Observations:                90576
Model:                          Logit   Df Residuals:                    90560
Method:                           MLE   Df Model:                           15
Date:                Mon, 25 Sep 2017   Pseudo R-squ.:                  0.5476
Time:                        20:47:05   Log-Likelihood:                -25035.
converged:                       True   LL-Null:                       -55335.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
TCODE         -0.0057      0.001    -10.614      0.000      -0.007      -0.005
POP901         0.0017   3.68

In [8]:
logit = sm.Logit(y_test, X_test)

# fit the model
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.279379
         Iterations 11
                           Logit Regression Results                           
Dep. Variable:               TARGET_B   No. Observations:                38819
Model:                          Logit   Df Residuals:                    38803
Method:                           MLE   Df Model:                           15
Date:                Mon, 25 Sep 2017   Pseudo R-squ.:                  0.5427
Time:                        20:47:14   Log-Likelihood:                -10845.
converged:                       True   LL-Null:                       -23714.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
TCODE         -0.0047      0.001     -8.488      0.000      -0.006      -0.004
POP901         0.0015   5.29

## 2.2 Decision Tree

In [9]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

tree.export_graphviz(clf, out_file='tree.dot',feature_names=X_train.columns )




To see whole tree:
    copy code in 'tree.dot' file and
    paste it to http://webgraphviz.com/

![alt text](Capture2.PNG)

## 2.3 Random Forrest

In [12]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=1000,random_state =111)
clf = clf.fit(X_train, y_train) 

from sklearn.metrics import r2_score
predicted_train = clf.predict(X_train)
train_score = r2_score(y_train, predicted_train)
print('Train data R-2 score:',train_score )

predicted_test = clf.predict(X_test)
test_score = r2_score(y_test, predicted_test)
print('Test data R-2 score:',test_score )


importance = clf.feature_importances_
importance = pd.DataFrame(importance, index=X_train.columns, 
                          columns=["Importance"])
importance.sort_values(by='Importance',ascending= False)

Train data R-2 score: 0.999963703252
Test data R-2 score: 0.999682054237


Unnamed: 0,Importance
RFA_2,0.2592
RFA_2R,0.252704
POP903,0.184759
LASTDATE,0.09794
MAXRDATE,0.064224
NEXTDATE,0.056063
TIMELAG,0.048103
NUMPRM12,0.022766
POP901,0.005293
RFA_2F,0.003416
