In [1]:
from pandas import read_pickle, DataFrame
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-computed feature bundle from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data = read_pickle("../data/data_all_features.pkl")

# Unpack every entry of the bundle in one pass (presumably a dict-like object
# whose .get() yields None for a missing key -- verify against the producer).
(
    X_train,
    y_train,
    X_test,
    feature_names,
    categorical,
    numerical,
    id_numeration_train,
    id_numeration_test,
) = (
    data.get(key)
    for key in (
        "X_train",
        "y_train",
        "X_test",
        "feature_names",
        "categorical",
        "numerical",
        "id_numeration_train",
        "id_numeration_test",
    )
)

In [3]:
def get_visitor_dataframe(X_train, y_train, feature_names):
    """Assemble the feature matrix and the revenue target into one DataFrame.

    Parameters
    ----------
    X_train : data accepted by the ``DataFrame`` constructor
        Feature values, one row per observation.
    y_train : values assignable as a DataFrame column
        Target values, stored under ``"totals_transaction_revenue"``.
    feature_names : sequence of column labels
        Labels applied to the columns of ``X_train``.

    Returns
    -------
    DataFrame
        The feature columns followed by ``"totals_transaction_revenue"``.
    """
    frame = DataFrame(data=X_train, columns=feature_names)
    frame["totals_transaction_revenue"] = y_train
    return frame

def add_positive_revenue_column(visitor_dataframe):
    """Add a float 0/1 flag marking rows whose revenue is strictly positive.

    The flag is written to a ``"positive_revenue"`` column on the frame that
    was passed in (the input is modified in place), and that same frame is
    returned.

    Parameters
    ----------
    visitor_dataframe : DataFrame
        Must contain a ``"totals_transaction_revenue"`` column.

    Returns
    -------
    DataFrame
        The same object, now carrying ``"positive_revenue"`` as floats
        (1.0 where revenue > 0, otherwise 0.0).
    """
    has_revenue = visitor_dataframe["totals_transaction_revenue"].gt(0)
    visitor_dataframe["positive_revenue"] = has_revenue.astype("float")
    return visitor_dataframe

In [4]:
# Build the labelled frame (features + revenue) and attach the binary
# positive_revenue flag in a single expression.
visitor_dataframe = add_positive_revenue_column(
    get_visitor_dataframe(X_train, y_train, feature_names)
)

In [5]:
# Hold out 30% of the labelled rows for evaluation. The target is the binary
# positive_revenue flag, so both revenue-derived columns are removed from the
# feature matrix.
# NOTE(review): this rebinds X_train, y_train and X_test, which were originally
# loaded from the pickle bundle; the loaded X_test is no longer reachable
# under that name after this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    visitor_dataframe.drop(["positive_revenue", "totals_transaction_revenue"], axis="columns"),
    visitor_dataframe["positive_revenue"],
    random_state=1,
    test_size=0.3,
)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# 250-tree forest with a fixed seed; trees are built on 50 parallel workers
# and progress is logged because verbose is enabled.
forest = RandomForestClassifier(
    n_estimators=250,
    n_jobs=50,
    random_state=1,
    verbose=True,
)

# Train the model on the training split (target flattened to a 1-D array)
forest.fit(X_train, y_train.values.ravel())

[Parallel(n_jobs=50)]: Using backend ThreadingBackend with 50 concurrent workers.


In [11]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

# Evaluate the fitted forest on the held-out split: confusion matrix,
# accuracy and F1 for the binary positive_revenue target.
prediction = forest.predict(X_test)
matrix = confusion_matrix(y_test, prediction)
# Compute the accuracy score
accuracy = accuracy_score(y_test, prediction)
# Compute the F1 score. The result is stored under its own name so the
# imported f1_score function is not overwritten by a float, which would make
# any later f1_score(...) call in this kernel fail.
f1 = f1_score(y_test, prediction)
print(f"accuracy: {accuracy}")
print(f"f1_score: {f1}")
print(matrix)

[Parallel(n_jobs=50)]: Using backend ThreadingBackend with 50 concurrent workers.
[Parallel(n_jobs=50)]: Done 100 tasks      | elapsed:    0.5s


accuracy: 0.9869067707429466
f1_score: 0.30031570639305444


[Parallel(n_jobs=50)]: Done 250 out of 250 | elapsed:    1.2s finished


In [12]:
import numpy as np

importances = forest.feature_importances_

# Column labels of the feature matrix (the frame minus both target columns)
feat_labels = visitor_dataframe.drop(columns=["positive_revenue", "totals_transaction_revenue"]).columns

# Feature positions ordered from the largest importance to the smallest
sorted_indices = np.argsort(importances)[::-1]

# One line per feature: rank, label left-aligned in a 30-character field, importance
for f in range(X_train.shape[1]):
    rank = f + 1
    feature_position = sorted_indices[f]
    print("%2d) %-*s %f" % (rank, 30, feat_labels[feature_position], importances[feature_position]))

 1) totals_pageviews               0.263584
 2) totals_hits                    0.254350
 3) weekday                        0.097415
 4) visitNumber                    0.064774
 5) geoNetwork_networkDomain       0.057273
 6) geoNetwork_city                0.017239
 7) device_operatingSystem         0.011241
 8) geoNetwork_metro               0.010767
 9) trafficSource_isTrueDirect     0.010330
10) geoNetwork_region              0.009831
11) geoNetwork_networkDomain_comcast.net 0.008407
12) trafficSource_source_mall.googleplex.com 0.008013
13) totals_newVisits               0.007862
14) device_operatingSystem_Macintosh 0.007611
15) geoNetwork_country             0.006828
16) trafficSource_source           0.006489
17) geoNetwork_country_United States 0.006224
18) totals_bounces                 0.005249
19) geoNetwork_subContinent_Northern America 0.005209
20) device_operatingSystem_Windows 0.005105
21) geoNetwork_networkDomain_verizon.net 0.005081
22) geoNetwork_networkDomain_rr.com 0.00