In [16]:
from pandas import read_pickle, DataFrame, concat
from sklearn.model_selection import train_test_split

In [17]:
data = read_pickle("../data/data_all_features.pkl")

X_train = data.get("X_train")
y_train = data.get("y_train")
X_test = data.get("X_test")
feature_names = data.get("feature_names")
categorical = data.get("categorical")
numerical = data.get("numerical")
id_numeration_train = data.get("id_numeration_train")
id_numeration_test = data.get("id_numeration_test")

In [18]:
def add_positive_revenue_column(visitor_dataframe):
    visitor_dataframe["positive_revenue"] = visitor_dataframe["totals_transaction_revenue"] > 0
    visitor_dataframe["positive_revenue"] = visitor_dataframe["positive_revenue"].astype("float")
    visitor_dataframe = visitor_dataframe.drop(columns=["totals_transaction_revenue"])
    return visitor_dataframe

def get_visitor_dataframe(X_train, y_train, feature_names):
    visitor_dataframe = DataFrame(columns=feature_names, data=X_train)
    visitor_dataframe["totals_transaction_revenue"] = y_train
    
    visitor_dataframe = add_positive_revenue_column(visitor_dataframe)
    
    return visitor_dataframe


In [19]:
visitor_dataframe = get_visitor_dataframe(X_train, y_train, feature_names)


In [20]:
paying_visitor_dataframe = visitor_dataframe[visitor_dataframe["positive_revenue"] == 1]
number_of_rows = len(paying_visitor_dataframe)
non_paying_visitor_dataframe = visitor_dataframe[visitor_dataframe["positive_revenue"] == 0][:number_of_rows]

balanced_dataframe = concat([paying_visitor_dataframe, non_paying_visitor_dataframe])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    balanced_dataframe.drop(columns=["positive_revenue"]), 
    balanced_dataframe["positive_revenue"], 
    test_size = 0.2,
    random_state=1
)

In [22]:
from sklearn.ensemble import RandomForestClassifier
 
forest = RandomForestClassifier(n_estimators=100, n_jobs=20, random_state=2, verbose=True)
forest.fit(X_train, y_train.values.ravel())

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished


In [23]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

prediction = forest.predict(X_test)
matrix = confusion_matrix(y_test, prediction)
# Display accuracy score
accuracy = accuracy_score(y_test, prediction)
# Display F1 score
f1_score = f1_score(y_test, prediction)
print(f"accuracy: {accuracy}")
print(f"f1_score: {f1_score}")
print(matrix)

accuracy: 0.9580890336590663
f1_score: 0.9585570109512563
[[2180  137]
 [  56 2232]]


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.0s finished


In [24]:
import numpy as np
 
importances = forest.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]
 
feat_labels = visitor_dataframe.drop(columns=["positive_revenue"]).columns
 
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

 1) totals_pageviews               0.280534
 2) totals_hits                    0.222943
 3) weekday                        0.094611
 4) totals_bounces                 0.052951
 5) geoNetwork_country_United States 0.046278
 6) geoNetwork_subContinent_Northern America 0.027936
 7) geoNetwork_continent_Americas  0.022962
 8) geoNetwork_country             0.022779
 9) visitNumber                    0.022077
10) trafficSource_source_mall.googleplex.com 0.018773
11) geoNetwork_networkDomain       0.015772
12) geoNetwork_metro               0.014548
13) totals_newVisits               0.013586
14) geoNetwork_city                0.009415
15) geoNetwork_continent_Europe    0.008829
16) device_operatingSystem_Macintosh 0.008472
17) trafficSource_source_youtube.com 0.007021
18) trafficSource_isTrueDirect     0.006920
19) channelGrouping_Referral       0.006348
20) geoNetwork_continent_Asia      0.005941
21) trafficSource_referralPath     0.005679
22) trafficSource_source           0.005230
23) de