# Read & Clear Data

In [2]:
import numpy as np
import pandas as pd
import feather 
import matplotlib.pyplot as plt  
from time import time
from mailerWithUtf8 import mail
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  
from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.utils import shuffle
from glob import glob
import os

In [3]:
def save_dataframe(df, out_filename):
    """Write ``df`` to ``<out_filename>.csv`` as UTF-8, without the index column.

    The time spent writing is printed once the file is complete.
    """
    csv_path = out_filename + ".csv"
    started_at = time()
    df.to_csv(csv_path, index=False, encoding='utf-8')
    elapsed = time() - started_at
    print("time for output csv file: %.2f" % elapsed)

In [4]:
def random_forest(train_df):
    """Fit a cross-validated random forest on ``train_df`` and write its reports.

    The last column of ``train_df`` is taken as the label and is expected to be
    the ``"Groups"`` column; all preceding columns are used as features.

    Besides its argument, the function reads these notebook globals, which the
    calling cell must set first: ``out_path``, ``out_filename``, ``each_dir``
    and ``start_time``.

    Files written (each named ``out_path + out_filename + <suffix>``):
        feature_important_descent.csv  one row per feature, most important first
        feature_important_one_row.csv  one column per feature plus Groups = each_dir
        _accuracy.csv                  the cross-validated accuracy
        confusion_matrix.csv           confusion matrix of predictions on train_df
        readme.md                      the text report, newlines replaced by <br>

    Returns:
        The mean cross-validated accuracy (5 folds) of the selected estimator.
    """
    train_x, train_y = train_df.iloc[:, 0:-1].values, train_df.iloc[:, -1].values
    le = preprocessing.LabelEncoder()
    le.fit(train_df["Groups"].unique())
    train_numeric_y = le.transform(train_y)

    # 'sqrt' is what 'auto' meant for classifiers; the 'auto' alias was removed
    # in scikit-learn 1.3, so spelling it out keeps the call valid everywhere.
    rf = RandomForestClassifier(max_features='sqrt',
                                random_state=42,
                                n_jobs=-1,
                                n_estimators=100)
    param_grid = {
        "min_samples_leaf": [10],
        "min_samples_split": [2],
        "max_depth": [25],
        "n_estimators": [100],
    }
    gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=1)
    clf = gs.fit(train_x, train_numeric_y).best_estimator_
    # best_score_ is the mean CV score of the candidate that produced `clf`; with
    # the single-candidate grid above it equals cv_results_["mean_test_score"][0].
    cv_accuracy = gs.best_score_

    # The label says "training set", but the number is the cross-validated accuracy.
    clf_info = ("Accuracy on training set: %f" % cv_accuracy) + '\n'
    clf_info += ('fit time %s seconds' % format(time() - start_time)) + '\n'

    important_dict = dict(zip(train_df.columns[:-1], clf.feature_importances_))
    # Sort ascending, then reverse, so tied features keep the same relative
    # order in the output files as before.
    important_list = sorted(important_dict.items(), key=lambda x: x[1])
    important_list.reverse()
    clf_info += '\n\nFeature Importances\n===================\n'
    for row in important_list:
        clf_info += str(row) + "\n"
    feature_df = pd.DataFrame(important_list, columns=["COLUMN", "IMPORTANT_VALUE"])
    feature_df.to_csv(out_path + out_filename + "feature_important_descent.csv", index=False)

    # Same importances as a single wide row, tagged with the current directory name.
    cpy_dict = dict(important_list)
    cpy_dict["Groups"] = each_dir
    feature_df = pd.DataFrame(cpy_dict, index=[0])
    feature_df.to_csv(out_path + out_filename + "feature_important_one_row.csv", index=False)

    accuracy_df = pd.DataFrame({"accuracy": cv_accuracy}, index=[0])
    accuracy_df.to_csv(out_path + out_filename + "_accuracy.csv", index=False)

    # Confusion matrix of the best estimator on the very data it was fitted on.
    predict_y = clf.predict(train_x)
    cnf_matrix = confusion_matrix(train_numeric_y, predict_y)
    group_encoder = []
    for idx, row in enumerate(cnf_matrix):
        # classes_[idx] is the label that was encoded as idx. Calling
        # le.inverse_transform(idx) with a bare scalar raises ValueError in
        # recent scikit-learn releases.
        current_group = str(le.classes_[idx])
        group_encoder.append(current_group)

        # Row idx holds every sample whose true class is idx, so its sum is the
        # class size; this does not depend on the labels being strings.
        idx_count_in_group = int(row.sum())

        clf_info += "\n\n" + str("class = %s count = [%s / %s]" % (current_group, row[idx], idx_count_in_group))
        clf_info += "\n\n" + str("predict %s accurancy = %s" % (current_group, row[idx] / idx_count_in_group))

    cnf_df = pd.DataFrame(cnf_matrix)
    cnf_df.columns = group_encoder
    cnf_df.index = group_encoder
    cnf_df.to_csv(out_path + out_filename + "confusion_matrix.csv", index=False)

    md_info = clf_info.replace("\n", "<br>")
    with open(out_path + out_filename + 'readme.md', 'w+') as f:
        f.write(md_info)
    return cv_accuracy

In [5]:
def random_forest_test(train_df):
    """Return the mean 5-fold cross-validated accuracy of a random forest on ``train_df``.

    The last column of ``train_df`` is taken as the label and is expected to be
    the ``"Groups"`` column; all preceding columns are used as features.
    Nothing is written to disk.
    """
    train_x, train_y = train_df.iloc[:, 0:-1].values, train_df.iloc[:, -1].values
    le = preprocessing.LabelEncoder()
    le.fit(train_df["Groups"].unique())
    train_numeric_y = le.transform(train_y)

    # 'sqrt' is what 'auto' meant for classifiers; the 'auto' alias was removed
    # in scikit-learn 1.3, so spelling it out keeps the call valid everywhere.
    rf = RandomForestClassifier(max_features='sqrt',
                                random_state=1,
                                n_jobs=-1,
                                n_estimators=100)
    param_grid = {
        "min_samples_leaf": [10],
        "n_estimators": [100],
    }
    gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=3)
    gs.fit(train_x, train_numeric_y)

    # Mean CV score of the selected candidate; with the single-candidate grid
    # above it equals cv_results_["mean_test_score"][0].
    return gs.best_score_

In [9]:
# List the entries of `path`. `path` is only assigned in a later cell, so that
# cell has to run before this one on a fresh kernel.
os.listdir(path)

['InstantMessage-High',
 'InstantMessage-SuperHigh',
 'Lifestyle',
 'No_significant_preference',
 'No_significant_preference(instant_message)',
 'Portal',
 'Social-media']

In [10]:
# Train and report one random forest per CSV found in each sub-directory of `path`.
# random_forest() reads `start_time`, `each_dir`, `out_filename` and `out_path`
# as globals, so they must keep these exact names.
start_time = time()
path = "D:/0814_sample_SMOTE/"
df = pd.DataFrame()
CLASSIFIER = "RANDOM_FOREST"
for each_dir in os.listdir(path):
    all_csvs = glob(path + each_dir + "/" + "*.csv")
    CURRENT_MODE = each_dir
    print(all_csvs)
    out_filename = CLASSIFIER + "_" + CURRENT_MODE + '_'
    # No per-file sub-folder: the former `csv_file[-5:-6]` suffix was an
    # always-empty slice, so reports have always landed directly in this folder.
    out_path = "clf_random_forest_model_kFold/" + each_dir + "/"
    for csv_file in all_csvs:
        # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed
        # in 2.0 (on_bad_lines='skip' replaces it there).
        train_df = pd.read_csv(csv_file, error_bad_lines=False)
        os.makedirs(out_path, exist_ok=True)
        # Seeded shuffle so the row order, and therefore the run, is reproducible.
        train_df = shuffle(train_df, random_state=42)
        random_forest(train_df)

['D:/0814_sample_SMOTE/InstantMessage-High\\0814_marketing_with_picked_group11_numeric_max_min_sample_InstantMessage-High_others.csv']
['D:/0814_sample_SMOTE/InstantMessage-SuperHigh\\0814_marketing_with_picked_group11_numeric_max_min_sample_InstantMessage-SuperHigh_others.csv']
['D:/0814_sample_SMOTE/Lifestyle\\0814_marketing_with_picked_group11_numeric_max_min_sample_Lifestyle_others.csv']
['D:/0814_sample_SMOTE/No_significant_preference\\0814_marketing_with_picked_group11_numeric_max_min_sample_No_significant_preference_others.csv']
['D:/0814_sample_SMOTE/No_significant_preference(instant_message)\\0814_marketing_with_picked_group11_numeric_max_min_sample_No_significant_preference(instant_message)_others.csv']
['D:/0814_sample_SMOTE/Portal\\0814_marketing_with_picked_group11_numeric_max_min_sample_Portal_others.csv']
['D:/0814_sample_SMOTE/Social-media\\0814_marketing_with_picked_group11_numeric_max_min_sample_Social-media_others.csv']


In [11]:
import requests

# The Mailgun API key is read from the MAILGUN_API_KEY environment variable so
# that it is not stored in the notebook.
# NOTE(review): the key that was previously hard-coded here should be treated
# as leaked and revoked.
requests.post(
        "https://api.mailgun.net/v3/sandboxe9bb891a60414f4bae93f2cc55daa963.mailgun.org/messages",
        auth=("api", os.environ["MAILGUN_API_KEY"]),
        data={"from": "Mailgun Sandbox <postmaster@sandboxe9bb891a60414f4bae93f2cc55daa963.mailgun.org>",
              "to": "Toby <atch84@gmail.com>",
              "subject": "Random Forest Finished",
              "text": "Fucking Finished"},
        # Without a timeout a stalled connection would block the cell forever.
        timeout=30)

<Response [200]>

In [17]:
# Send a "finished" notification through the project-local mailer.
# NOTE(review): main() presumably takes (subject, body) — confirm against
# mailerWithUtf8, which is not part of this notebook.
from mailerWithUtf8 import mail
test=mail()
test.main("random forest finished", "done")

Start Send Mail
User Email : a - DONE 
User Email : a - DONE 
User Email : 2 - DONE 
User Email : 6 - DONE 
User Email : 3 - DONE 
User Email : 6 - DONE 
User Email : 5 - DONE 
User Email : 5 - DONE 
User Email : 6 - DONE 
User Email : 6 - DONE 
User Email : @ - DONE 
User Email : g - DONE 
User Email : m - DONE 
User Email : a - DONE 
User Email : i - DONE 
User Email : l - DONE 
User Email : . - DONE 
User Email : c - DONE 
User Email : o - DONE 
User Email : m - DONE 
Finish Send Mail


# Check accuracy with a clipped (top-N) feature set

In [None]:
# For every directory, find the smallest number of top-ranked features whose
# cross-validated accuracy is no more than 0.01 below the accuracy stored in
# "*accuracy.csv", and write that number to clip_feature.txt.
# NOTE(review): `dir_list` is only assigned in a later cell of this notebook;
# that cell has to run before this one.
start_time = time()
path = "C:/Users/VIPLAB/Desktop/preprocess_py/marketing_analyze/0731_sample/"
accu_path = "C:/Users/VIPLAB/Desktop/preprocess_py/clf_random_forest_model_kFold/"

df = pd.DataFrame()
CLASSIFIER = "RANDOM_FOREST"
for each_dir in dir_list:
    all_csvs = glob(path + each_dir + "/" + "*.csv")
    CURRENT_MODE = each_dir
    print(all_csvs)
    out_filename = CLASSIFIER + "_" + CURRENT_MODE + '_'
    # No per-file sub-folder: the former `csv_file[-5:-6]` suffix was an
    # always-empty slice.
    out_path = "clf_random_forest_model_kFold/" + each_dir + "/"

    accuracy_files = glob(accu_path + each_dir + "/" + "*accuracy.csv")
    ranking_files = glob(accu_path + each_dir + "/" + "*descent.csv")
    if not accuracy_files or not ranking_files:
        # Without this guard the values left over from the previous directory
        # would be reused (or a NameError raised for the first directory).
        print("skip %s: saved accuracy or feature ranking not found" % each_dir)
        continue
    # When several files match, the last one is used.
    accu_df = pd.read_csv(accuracy_files[-1], error_bad_lines=False)
    origin_accu = float(accu_df["accuracy"][0])
    feature_df = pd.read_csv(ranking_files[-1], error_bad_lines=False)
    feature_list = list(feature_df["COLUMN"])

    # Asking for more columns than the ranking holds would only repeat the
    # same fit, so the candidate counts stop at the ranking length (max 99).
    max_count = min(99, len(feature_list))
    first_count = max(1, min(5, max_count))

    for csv_file in all_csvs:
        train_df = pd.read_csv(csv_file, error_bad_lines=False)
        print("origin_accu =", origin_accu)
        os.makedirs(out_path, exist_ok=True)
        for important_count in range(first_count, max_count + 1):
            temp_list = feature_list[:important_count].copy()
            temp_list.append("Groups")
            simple_predict = float(random_forest_test(train_df[temp_list]))
            print("use pre %d cols accurancy = %s" % (important_count, simple_predict))
            if simple_predict - origin_accu > -0.01:
                with open(accu_path + each_dir + "/clip_feature.txt", "w") as text_file:
                    text_file.write(str(important_count))
                break

In [None]:
# Train and report a random forest for every CSV of the "adult" sample directory.
# random_forest() reads `start_time`, `each_dir`, `out_filename` and `out_path`
# as globals, so they must keep these exact names.
start_time = time()
path = "C:/Users/VIPLAB/Desktop/preprocess_py/marketing_analyze/0730_sample/"
df = pd.DataFrame()
dir_list = ["adult"]
CLASSIFIER = "RANDOM_FOREST"
for each_dir in dir_list:
    all_csvs = glob(path + each_dir + "/" + "*.csv")
    CURRENT_MODE = each_dir
    print(all_csvs)
    out_filename = CLASSIFIER + "_" + CURRENT_MODE + '_'
    # No per-file sub-folder: the former `csv_file[-5:-6]` suffix was an
    # always-empty slice, so reports have always landed directly in this folder.
    out_path = "clf_random_forest_model_kFold/" + each_dir + "/"
    for csv_file in all_csvs:
        train_df = pd.read_csv(csv_file, error_bad_lines=False)
        os.makedirs(out_path, exist_ok=True)
        print(random_forest(train_df))

In [None]:
# Columns to load from each CSV; passed as `usecols` to pd.read_csv in the
# training cell that follows. "Groups" is the label column.
# NOTE: pandas ignores the order of `usecols` and keeps the file's column order,
# so listing "Groups" last here does not by itself make it the last column.
select_col = [
    "GENDER_CODE",
    "DATA_USAGE_MB",
    "P3M_AVG_DATA_USAGE_MB",
    "L3M_DATA_USAGE_MB",
    "IMEI_MKT_NAME",
    "IMEI_MFG_NAME",
    "AGE",
    "DATA_INV_AMT",
    "NET_INV_AMT",
    "TENURE_SCV",
    "L3M_AVG_NET_INV_AMT",
    "DATA_RC_AMT",
    "L3M_NET_INV_AMT",
    "BILL_ZIP_CODE",
    "ACTV_STORE_ID",
    "P3M_MO_PSTN_DUR",
    "P3M_MO_OFFNET_DUR",
    "MOST_MT_DUR",
    "MT_STM_AMT",
    "VOICE_INV_AMT",
    "Groups"
]

In [None]:
# Same "adult" training run, restricted to the columns named in `select_col`.
# random_forest() reads `start_time`, `each_dir`, `out_filename` and `out_path`
# as globals, so they must keep these exact names.
start_time = time()
path = "C:/Users/VIPLAB/Desktop/preprocess_py/marketing_analyze/0730_sample/"
df = pd.DataFrame()
dir_list = ["adult"]
CLASSIFIER = "RANDOM_FOREST"
for each_dir in dir_list:
    all_csvs = glob(path + each_dir + "/" + "*.csv")
    CURRENT_MODE = each_dir
    print(all_csvs)
    out_filename = CLASSIFIER + "_" + CURRENT_MODE + '_'
    # No per-file sub-folder: the former `csv_file[-5:-6]` suffix was an
    # always-empty slice, so reports have always landed directly in this folder.
    out_path = "clf_random_forest_model_kFold/" + each_dir + "/"
    for csv_file in all_csvs:
        train_df = pd.read_csv(csv_file, error_bad_lines=False, usecols=select_col)
        os.makedirs(out_path, exist_ok=True)
        print(random_forest(train_df))

In [None]:
# Create the output folder for the "adult_test_vs_others" run.
# NOTE(review): `file_sample_count` is not assigned anywhere in the visible
# notebook; it must be set (as a string) before this cell runs.
out_path = "clf_random_forest_model_kFold/adult_test_vs_others/" + file_sample_count +"/"
if not os.path.exists(out_path):
    os.makedirs(out_path)

In [None]:
train_x, train_y = train_df.iloc[:, 0:-1].values, train_df.iloc[:, -1].values

In [None]:
# test_x, test_y = test_df.iloc[:, 0:-1].values, test_df.iloc[:, -1].values

# Convert y to numeric

In [None]:
# Map every distinct value of the "Groups" column to an integer code, then
# encode the label vector with that mapping.
le = preprocessing.LabelEncoder().fit(train_df["Groups"].unique())
train_numeric_y = le.transform(train_y)


In [None]:
# test_numeric_y = le.transform(test_y) 

# build Random Forest Classifier


In [None]:
start_time = time()


In [None]:
# Base estimator for the grid search, with out-of-bag scoring enabled.
# 'sqrt' is what 'auto' meant for classifiers; the 'auto' alias was removed in
# scikit-learn 1.3, so spelling it out keeps the call valid everywhere.
rf = RandomForestClassifier(max_features='sqrt',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1,
                            n_estimators = 100)



In [None]:
param_grid = {"criterion" : ["gini"], 
              "min_samples_leaf" : [10], 
              "min_samples_split" : [2],
              "max_depth" : [None],
              "n_estimators": [100]}

In [None]:
# param_grid = {"criterion" : ["gini"], 
#               "min_samples_leaf" : [10], 
#               "min_samples_split" : [2],
# #               "max_depth" : [10],
#               "n_estimators": [100]}

In [None]:
gs.cv_results_["mean_test_score"][0]

In [None]:
from winsound import Beep
Beep(440, 500) 
Beep(440, 500) 

In [None]:
gs.cv_results_

In [None]:
clf = grid_clf.best_estimator_

# saving classifier

In [None]:
# joblib.dump(clf, out_path + out_filename + 'CLF.pkl') 
# clf = joblib.load('filename.pkl') 


In [None]:
# Start the text report with the mean CV score of the first grid candidate and
# the time elapsed since `start_time`.
# NOTE(review): no top-level cell in the visible notebook creates `gs`.
accuracy_line = "Accuracy on training set: %f" % gs.cv_results_["mean_test_score"][0]
timing_line = 'fit time %s seconds' % format(time() - start_time)
clf_info = accuracy_line + '\n' + timing_line + '\n'
print(clf_info)

In [None]:
# predict_y = clf.predict(test_x)

In [None]:
# predict_y 

In [None]:
# test_numeric_y

# get feature importances

In [None]:
important_dict = dict(zip(train_df.columns[:-1],clf.feature_importances_))

In [None]:
important_dict

In [None]:
important_list = sorted(important_dict.items(), key=lambda x: x[1])

In [None]:
important_list.reverse()

In [None]:
clf_info += '\n\nFeature Importances\n===================\n'
for row in important_list:
    clf_info += str(row) + "\n"
    print(str(row))

In [None]:
feature_df = pd.DataFrame(important_list, columns = ["COLUMN", "IMPORTANT_VALUE"])

In [None]:
t0 = time()
feature_df.to_csv(out_path + out_filename + "feature_important_descent.csv", index=False)
print("time for output csv file: %.2f" % (time()-t0))

In [None]:
# cpy_dict = dict(important_list)
# cpy_dict["Groups"] = target_groups
# feature_df = pd.DataFrame(cpy_dict, index = [0])

In [None]:
# cpy_dict = dict(important_list)
# # cpy_dict["Groups"] = target_groups
# feature_df = pd.DataFrame(cpy_dict, index = [0])

In [None]:
# t0 = time()
# feature_df.to_csv(out_path + out_filename + "feature_important_one_row.csv", index=False)
# print("time for output csv file: %.2f" % (time()-t0))

In [None]:
clf.n_classes_

In [None]:
clf.n_outputs_

In [None]:
predict_y = clf.predict(train_x)

In [None]:
# cnf_matrix = confusion_matrix(test_numeric_y, predict_y )
cnf_matrix = confusion_matrix(train_numeric_y, predict_y )
cnf_matrix

In [None]:
# Per-class hit rate on the training data: the diagonal entry of each confusion
# matrix row divided by the number of samples whose true class is that row.
group_encoder = []
for idx, row in enumerate(cnf_matrix):
    # classes_[idx] is the label that was encoded as idx. Calling
    # le.inverse_transform(idx) with a bare scalar raises ValueError in recent
    # scikit-learn releases.
    current_group = str(le.classes_[idx])
    group_encoder.append(current_group)

    print(current_group)

    # The row sum is the class size; unlike a string comparison against
    # train_df["Groups"], it also works when the labels are not strings.
    idx_count_in_group = int(row.sum())

    clf_info +=  "\n\n" + str("class = %s count = [%s / %s]" % (current_group, row[idx], idx_count_in_group))
    clf_info +=  "\n\n" + str("predict %s accurancy = %s" % (current_group, row[idx] / idx_count_in_group))
    print("class = %s count = [%s / %s]" % (current_group,row[idx],str(idx_count_in_group)))
    print("predict %s accurancy = %s" % (current_group, row[idx] / idx_count_in_group))
    print()

In [None]:
cnf_df = pd.DataFrame(cnf_matrix)
cnf_df.columns = group_encoder
cnf_df.index = group_encoder

In [None]:
cnf_df

In [None]:
cnf_df.to_csv(out_path + out_filename + "confusion_matrix.csv", index=False)


In [None]:
md_info = clf_info.replace("\n", "<br>")
with open(out_path + out_filename + 'readme.md', 'w+') as f:
     f.write(md_info)
f.closed

In [None]:
plt.plot(clf.feature_importances_, 'o')  
# for i in 
plt.xticks(range(train_x.shape[1]), train_df.columns[:-1], rotation=90)  
plt.ylim(0, 1)  
plt.show()  

In [None]:
from winsound import Beep
Beep(440, 500) 
Beep(440, 500) 

In [None]:
from mailerWithUtf8 import mail
mail_info = clf_info.replace("\n", "<br>")
test=mail()
test.main("clf_info finished", mail_info)

# Check feature importances accuracy

In [None]:
# Take the names of the first `limit_importances` entries of `important_list`
# and put the label column after them.
limit_importances = 5
feature_cols = [feature[0] for feature in important_list[:limit_importances]]
feature_cols.append("Groups")

In [None]:
feature_cols

In [None]:
feature_train_x = train_df[feature_cols].iloc[:, 0:-1].values
feature_test_x = test_df[feature_cols].iloc[:, 0:-1].values

In [None]:
gs_feature = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_feature_clf = gs_feature.fit(feature_train_x, train_numeric_y)

In [None]:
# Text report for the model trained on the reduced feature set.
# NOTE(review): `test_numeric_y` is only ever assigned in commented-out code in
# the visible notebook.
clf_feature = grid_feature_clf.best_estimator_
clf_feature_info = str(("Accuracy on training set: %f" % clf_feature.score(feature_train_x, train_numeric_y))) + '\n'
clf_feature_info += str(("Accuracy on test set: %f" % clf_feature.score(feature_test_x, test_numeric_y))) + '\n'
clf_feature_info += str(('fit time %s seconds' % format(time() - start_time))) + '\n'
# The closing parentheses were missing on this line, which made the whole cell
# a SyntaxError.
clf_feature_info += str(('feature selection numbers = %s' % str(limit_importances))) + '\n'

print(clf_feature_info)

In [None]:
predict_feature_y = clf_feature.predict(feature_test_x)

In [None]:
cnf_matrix = confusion_matrix(test_numeric_y, predict_feature_y )
cnf_matrix

In [None]:
# Per-class hit rate for the reduced-feature model: the diagonal entry of each
# confusion matrix row divided by the number of samples whose true class is
# that row.
group_encoder = []
for idx, row in enumerate(cnf_matrix):
    # classes_[idx] is the label that was encoded as idx. Calling
    # le.inverse_transform(idx) with a bare scalar raises ValueError in recent
    # scikit-learn releases.
    current_group = str(le.classes_[idx])
    group_encoder.append(current_group)

    print(current_group)

    # The row sum is the class size within the evaluated data, so no lookup in
    # `test_df` (not defined in the visible notebook) is needed.
    idx_count_in_group = int(row.sum())
    clf_feature_info +=  "\n\n" + str("class = %s count = [%s / %s]" % (current_group, row[idx], idx_count_in_group))
    clf_feature_info +=  "\n\n" + str("predict %s accurancy = %s" % (current_group, row[idx] / idx_count_in_group))
    print("class = %s count = [%s / %s]" % (current_group,row[idx],str(idx_count_in_group)))
    print("predict %s accurancy = %s" % (current_group, row[idx] / idx_count_in_group))
    print()

In [None]:
cnf_pd = pd.DataFrame(cnf_matrix)
cnf_pd.columns = group_encoder
cnf_pd.index = group_encoder

In [None]:
cnf_pd

In [None]:
# Play a short jingle to signal that the notebook has finished.
# Each pair is (frequency in Hz, duration in ms), played in order.
from winsound import Beep

melody = [
    (440, 500), (440, 500), (440, 500),
    (349, 350), (523, 150), (440, 500),
    (349, 350), (523, 150), (440, 1000),
    (659, 500), (659, 500), (659, 500),
    (698, 350), (523, 150), (415, 500),
    (349, 350), (523, 150), (440, 1000),
]
for frequency, duration in melody:
    Beep(frequency, duration)