In [39]:
from pandas import read_pickle, DataFrame, concat
from sklearn.model_selection import train_test_split

In [7]:
# Read the pickled feature bundle once, then pull each entry out by key.
# NOTE(review): `data` is presumably a dict, in which case `.get` gives None
# for a key that is absent instead of raising - confirm against the writer.
data = read_pickle("app/data/data_all_features.pkl")

(
    X_train,
    y_train,
    X_test,
    feature_names,
    categorical,
    numerical,
    id_numeration_train,
    id_numeration_test,
) = (
    data.get(key)
    for key in (
        "X_train",
        "y_train",
        "X_test",
        "feature_names",
        "categorical",
        "numerical",
        "id_numeration_train",
        "id_numeration_test",
    )
)

In [20]:
def add_positive_revenue_column(visitor_dataframe):
    """Swap the raw revenue column for a binary ``positive_revenue`` flag.

    The flag is 1.0 where ``totals_transaction_revenue`` is strictly greater
    than zero and 0.0 otherwise (a missing revenue compares as not positive).

    Parameters
    ----------
    visitor_dataframe : pandas.DataFrame
        Frame containing a ``totals_transaction_revenue`` column. It is left
        untouched; the result is built on a new frame.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``totals_transaction_revenue`` and with a float
        ``positive_revenue`` column.

    Raises
    ------
    KeyError
        If ``totals_transaction_revenue`` is not a column of the input.
    """
    has_revenue = visitor_dataframe["totals_transaction_revenue"] > 0
    # drop() and assign() both return new frames, so the caller's frame is
    # never modified as a side effect of computing the flag.
    return (
        visitor_dataframe
        .drop(columns=["totals_transaction_revenue"])
        .assign(positive_revenue=has_revenue.astype("float"))
    )

def get_visitor_dataframe(X_train, y_train, feature_names):
    """Assemble the labelled per-row frame from features and target.

    Wraps ``X_train`` in a DataFrame whose columns are ``feature_names``,
    attaches ``y_train`` as a ``totals_transaction_revenue`` column, and
    returns whatever ``add_positive_revenue_column`` produces from that frame.
    """
    frame = DataFrame(data=X_train, columns=feature_names)
    frame["totals_transaction_revenue"] = y_train
    return add_positive_revenue_column(frame)


In [21]:
# Build the labelled frame from the feature matrix, target and column names.
visitor_dataframe = get_visitor_dataframe(
    X_train=X_train,
    y_train=y_train,
    feature_names=feature_names,
)


In [40]:
# Balance the two classes by undersampling the non-paying rows down to the
# number of paying rows.
paying_visitor_dataframe = visitor_dataframe[visitor_dataframe["positive_revenue"] == 1]
number_of_rows = len(paying_visitor_dataframe)

# Draw the non-paying rows at random (seeded for reproducibility) rather than
# taking the first N, so the subset does not depend on the storage order of
# the frame. The sample size is capped at the rows available because
# DataFrame.sample raises when asked for more rows than exist.
non_paying_visitor_dataframe = (
    visitor_dataframe[visitor_dataframe["positive_revenue"] == 0]
    .pipe(
        lambda frame: frame.sample(
            n=min(number_of_rows, len(frame)),
            random_state=1,
        )
    )
)

balanced_dataframe = concat([paying_visitor_dataframe, non_paying_visitor_dataframe])

In [42]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
# NOTE(review): the split is drawn from visitor_dataframe - balanced_dataframe
# is not used here - and it rebinds the X_train / y_train / X_test names that
# were loaded from the pickle. Confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(
    visitor_dataframe.drop("positive_revenue", axis=1),
    visitor_dataframe.loc[:, "positive_revenue"],
    test_size=0.2,
    random_state=1,
)

In [45]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree forest, seeded, trained with 20 parallel jobs and progress logging.
forest = RandomForestClassifier(
    n_estimators=100,
    n_jobs=20,
    random_state=2,
    verbose=True,
)
# Pass the labels as a flat 1-D array.
forest.fit(X_train, y_train.to_numpy().ravel())

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    5.3s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:   26.0s finished


In [46]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

# Score the fitted forest on the held-out rows.
prediction = forest.predict(X_test)

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the confusion matrix has to be printed explicitly to be visible.
print(confusion_matrix(y_test, prediction))

accuracy = accuracy_score(y_test, prediction)
# Keep the score under its own name: assigning it to `f1_score` would rebind
# the imported function to a float and break any later call to f1_score(...).
f1 = f1_score(y_test, prediction)

print(f"accuracy: {accuracy}")
print(f"f1_score: {f1}")

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s


accuracy: 0.986879053563813
f1_score: 0.3083211678832117


[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.3s finished


In [48]:
import numpy as np

importances = forest.feature_importances_

# Feature positions ordered from the largest importance to the smallest.
sorted_indices = np.argsort(importances)[::-1]

# Column labels of the feature frame (everything except the target column).
feat_labels = visitor_dataframe.drop(columns=["positive_revenue"]).columns

# One line per feature: rank, name left-aligned in a 30-character field, score.
for f, feature_index in enumerate(sorted_indices[: X_train.shape[1]]):
    print(
        "%2d) %-*s %f"
        % (f + 1, 30, feat_labels[feature_index], importances[feature_index])
    )

 1) totals_pageviews               0.269958
 2) totals_hits                    0.246995
 3) weekday                        0.097834
 4) visitNumber                    0.066160
 5) geoNetwork_networkDomain       0.057714
 6) geoNetwork_city                0.016942
 7) device_operatingSystem         0.010986
 8) geoNetwork_metro               0.010702
 9) trafficSource_isTrueDirect     0.010376
10) geoNetwork_region              0.009684
11) totals_newVisits               0.008476
12) trafficSource_source_mall.googleplex.com 0.008409
13) geoNetwork_networkDomain_comcast.net 0.008205
14) device_operatingSystem_Macintosh 0.007217
15) geoNetwork_country             0.006928
16) geoNetwork_country_United States 0.006454
17) geoNetwork_subContinent_Northern America 0.005912
18) trafficSource_source           0.005616
19) geoNetwork_continent_Americas  0.005597
20) device_operatingSystem_Windows 0.005206
21) geoNetwork_networkDomain_verizon.net 0.005044
22) geoNetwork_networkDomain_rr.com 0.00