### Imports

In [1]:
import pandas as pd
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.linear_model as lm
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


### Funções

In [2]:
def load_data(file_path):
    return pd.read_parquet(file_path)

def df_summary(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Column'] = summary['index']
    summary = summary[['Column','dtypes']]
    summary['# Missing'] = df.isnull().sum().values    
    summary['# Unique'] = df.nunique().values
    summary['Example'] = df.loc[0].values

    for name in summary['Column'].value_counts().index:
        summary.loc[summary['Column'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary



### Carregando dados

In [3]:
seed = 42
file_path = '../data/outputs/costumers_feature_df.parquet'
file_path2 = '../data/outputs/costumers_feature_df.parquet'
df = load_data(file_path2)
##display(df)

rating = df['review_score_factor']

drop_columns = ['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', \
                'order_delivered_customer_date', 'order_estimated_delivery_date', 'review_id', 'review_comment_title', 'review_score', \
                'review_creation_date', 'review_comment_message', 'review_answer_timestamp', 'payment_sequential', 'payment_type', 'customer_unique_id', \
                'customer_zip_code_prefix', 'customer_city', 'customer_state', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', \
                'freight_value', 'product_category_name', 'product_category_name_english', 'product_length_cm', 'product_height_cm', 'product_width_cm', \
                'seller_zip_code_prefix', 'seller_city', 'seller_state', 'review_score_factor']

features = df.drop(drop_columns, axis=1)
features = features.fillna(features.mean())

### Dividindo os dados

In [4]:

x_train, x_test, y_train, y_test = train_test_split(features, rating, test_size = 0.2, random_state = seed)

### Criação e treinamento do modelo de Árvore de decisão

In [5]:
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(x_train, y_train)

# Realização de previsões
y_pred = clf.predict(x_test)

# Avaliação do modelo
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'The accuracy of the DT model is {accuracy}')

# Exibir relatório de classificação
print(report)

The accuracy of the KNN model is 0.5799238820171265
              precision    recall  f1-score   support

         0.0       0.52      0.55      0.54      3693
         1.0       0.42      0.42      0.42      6491
         2.0       0.68      0.67      0.67     12938

    accuracy                           0.58     23122
   macro avg       0.54      0.55      0.54     23122
weighted avg       0.58      0.58      0.58     23122



### Criação e treinamento do modelo de regressão logística

In [7]:
# Criar e treinar o modelo de regressão logística
model = LogisticRegression(max_iter=1000, random_state=seed)
model.fit(x_train, y_train)

# Fazer previsões e avaliar o modelo
y_pred_lr = model.predict(x_test)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

         0.0       0.61      0.21      0.32      3693
         1.0       0.38      0.02      0.03      6491
         2.0       0.58      0.97      0.73     12938

    accuracy                           0.58     23122
   macro avg       0.52      0.40      0.36     23122
weighted avg       0.53      0.58      0.47     23122



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Criação e treinamento do modelo com regressão linear

In [None]:

reg = lm.LinearRegression().fit(x_train, y_train)

y_pred_lr = reg.predict(x_test)

score = reg.score(x_test, y_test)
rmse_lr = np.sqrt(mean_squared_error(y_test.values, y_pred_lr))

print(f'The R2 score in this linear regression model is {round(score, 5)}')
print(f'The RMSE in this linear regression model is {round(rmse_lr, 5)}, which is about {100*round(rmse_lr/np.mean(rating),2)}% of \
the mean rating score.')

### Criação e treinamento do modelo com floresta aleatória

In [6]:
# Criar e treinar o modelo de floresta aleatória
model = RandomForestClassifier(n_estimators=100, random_state=seed)
model.fit(x_train, y_train)

# Fazer previsões 
y_pred_rf = model.predict(x_test)

# Avaliar o modelo
accuracy = accuracy_score(y_test, y_pred_rf)
print(f'The accuracy of the RF model is {round(accuracy, 5)}')

# Exibir relatório de classificação
print(classification_report(y_test, y_pred_rf))

The accuracy of the KNN model is 0.67715
              precision    recall  f1-score   support

         0.0       0.82      0.57      0.67      3693
         1.0       0.61      0.26      0.37      6491
         2.0       0.67      0.91      0.77     12938

    accuracy                           0.68     23122
   macro avg       0.70      0.58      0.60     23122
weighted avg       0.67      0.68      0.64     23122



### Criação e treinamento do modelo utilizando KNN

In [8]:
# Criar e treinar o modelo K-Nearest Neighbors
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(x_train, y_train)

# Fazer previsões
y_pred_knn = model_knn.predict(x_test)

# Avaliar o modelo
accuracy = accuracy_score(y_test, y_pred_knn)
print(f'The accuracy of the KNN model is {round(accuracy, 5)}')

# Exibir relatório de classificação
print(classification_report(y_test, y_pred_knn))

The accuracy of the KNN model is 0.53404
              precision    recall  f1-score   support

         0.0       0.39      0.33      0.36      3693
         1.0       0.36      0.30      0.33      6491
         2.0       0.63      0.71      0.67     12938

    accuracy                           0.53     23122
   macro avg       0.46      0.45      0.45     23122
weighted avg       0.52      0.53      0.52     23122



### Criação e treinamento do modelo utilizando GBM

In [5]:
# Criar e treinar o modelo Gradient Boosting
model_gbm = GradientBoostingClassifier(n_estimators=100, random_state=seed)
model_gbm.fit(x_train, y_train)

# Fazer previsões
y_pred_gbm = model_gbm.predict(x_test)

# Avaliar o modelo
accuracy = accuracy_score(y_test, y_pred_gbm)
print(f'The accuracy of the Gradient Boosting model is {round(accuracy, 5)}')

# Exibir relatório de classificação
print(classification_report(y_test, y_pred_gbm))

The accuracy of the Gradient Boosting model is 0.60687
              precision    recall  f1-score   support

         0.0       0.73      0.34      0.46      3693
         1.0       0.44      0.02      0.04      6491
         2.0       0.60      0.98      0.74     12938

    accuracy                           0.61     23122
   macro avg       0.59      0.45      0.42     23122
weighted avg       0.57      0.61      0.50     23122

