In [None]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

from processing import encoding, detection, outliers

# Location of the input dataset, relative to the notebook's working directory.
parquet_file_path = '../xxx.parquet'

# Load the whole dataset into memory; later cells read from (and some mutate) `df`.
df = pd.read_parquet(path=parquet_file_path)


In [None]:


# Lift the row limit so the dtype listing is shown in full for wide datasets.
with pd.option_context('display.max_rows', None):
    display(HTML("<h3>Column Data Types</h3>"))
    display(df.dtypes)

    # Object-dtype columns that detection.all_strings accepts are treated as categoricals.
    category_like_columns = []
    for col in df.columns:
        if df[col].dtype == 'object' and detection.all_strings(df[col]):
            category_like_columns.append(col)
    display(HTML("<h3>Categorical columns</h3>"))
    display(category_like_columns)

    # Columns that detection.contains_list flags are handled as list-valued columns.
    list_like_columns = []
    for col in df.columns:
        if detection.contains_list(df[col]):
            list_like_columns.append(col)
    display(HTML("<h3>List columns</h3>"))
    display(list_like_columns)

    # Only float32/float64 columns count as "float like".
    float_like_columns = list(df.select_dtypes(include=['float64', 'float32']).columns)
    display(HTML("<h3>Float like columns</h3>"))
    display(float_like_columns)

# Convert the detected categorical columns in place, one column at a time.
for col in category_like_columns:
    df[col] = df[col].astype('category')


In [None]:
from processing import encoding

# For each column family: an (editable) exclusion list, the surviving column names in
# their original order, and an independent copy of just those columns.

# List-valued columns.
exclude_list_columns = []
list_like_columns_for_analysis = [
    name for name in list_like_columns if name not in exclude_list_columns
]
only_lists_df = df.loc[:, list_like_columns_for_analysis].copy()

# Categorical columns.
exclude_categorical_columns = []
categorical_columns_for_analysis = [
    name for name in category_like_columns if name not in exclude_categorical_columns
]
category_like_df = df.loc[:, categorical_columns_for_analysis].copy()

# Float columns.
exclude_float_columns = []
float_columns_for_analysis = [
    name for name in float_like_columns if name not in exclude_float_columns
]
float_like_df = df.loc[:, float_columns_for_analysis].copy()


In [None]:

display(HTML("<h3>Displaying list types</h3>"))

# Summarise the individual elements of every list-valued column.
for column_to_unnest in list_like_columns_for_analysis:
    # Explode the column on its own: one row per list element. No companion column is
    # needed for the summary, so the cell does not depend on any other column existing.
    exploded_values = df[column_to_unnest].explode().astype('category')

    # describe() on a categorical Series reports count / unique / top / freq.
    display(exploded_values.describe())

    display(HTML("<br>"))

In [None]:
# Mask the outliers of every float column under analysis, then plot what remains.
# NOTE(review): NaN is written into `df` in place, so re-running this cell operates on
# already-masked data and may mask further values — confirm that is acceptable.
# NOTE(review): `float_like_df` was copied from `df` in an earlier cell, so it does not
# receive these NaNs — confirm the cells that consume `float_like_df` are meant to keep
# the outliers.
for column in float_columns_for_analysis:
    # Only the second returned value is used; it serves as the row selector for df.loc.
    _, outlier_indices = outliers.find_outliers(df[column])

    df.loc[outlier_indices, column] = np.nan

    outliers.histplot(df[column])

In [None]:


from processing.correlation import test_correlation

# Pairwise correlation tests between the float columns; only significant pairs are kept.
numerical_data = df[float_columns_for_analysis]

result_columns = ['Variable 1', 'Variable 2', 'Correlation Coefficient', 'P-Value']
rows_list = []

# Significance threshold applied to each test's p-value.
alpha = 0.01

for col1 in float_columns_for_analysis:
    for col2 in float_columns_for_analysis:
        if col1 == col2:
            continue
        r, p = test_correlation(numerical_data, col1, col2)
        # Keep significant pairs, skipping coefficients that are exactly 1.0.
        if p < alpha and r != 1.0:
            rows_list.append({
                "Variable 1": col1,
                "Variable 2": col2,
                "Correlation Coefficient": r,
                "P-Value": p
            })

# Build the table in one step. Passing `columns` keeps the expected header even when no
# pair is significant, and avoids concatenating onto an empty placeholder frame.
significant_results = pd.DataFrame(rows_list, columns=result_columns).sort_values(
    by='Correlation Coefficient', ascending=False
)

display(HTML(significant_results.to_html(index=False, escape=False)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation matrix of the float columns, drawn as an annotated heatmap.
numerical_data = df[float_columns_for_analysis]
corr_matrix = numerical_data.corr()

fig, ax = plt.subplots(figsize=(20, 20))

sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()

In [None]:
import seaborn as sns
import numpy as np

# Pair-plot a 10% random sample of the float columns. The sample is seeded so the figure
# is reproducible across runs, matching the random_state=42 used for the model cells.
# sample() already returns a new DataFrame, so no extra copy is required.
subset_data = df[float_columns_for_analysis].sample(frac=0.1, random_state=42)

sns.pairplot(subset_data)
plt.show()

In [None]:

# Encode with the same filtered column lists that were used to build only_lists_df and
# category_like_df, so the column names passed always exist in the frame being encoded,
# including when an exclude list is non-empty.
# NOTE(review): assumes the second argument names the columns to encode — confirm
# against processing.encoding.
only_lists_transformed_df = encoding.encode_list_like(only_lists_df, list_like_columns_for_analysis)
category_transformed_df = encoding.encode_categorical(category_like_df, categorical_columns_for_analysis)


In [None]:

# Show every column of the summaries instead of pandas' truncated default view.
with pd.option_context('display.max_columns', None):
    # Summary statistics of the three feature frames, in list / categorical / float order.
    for feature_frame in (only_lists_transformed_df, category_transformed_df, float_like_df):
        display(feature_frame.describe())

    # The column names each family contributes to the analysis, in the same order.
    for analysed_columns in (
        list_like_columns_for_analysis,
        categorical_columns_for_analysis,
        float_columns_for_analysis,
    ):
        display(analysed_columns)

In [None]:
import pandas as pd

# Place the encoded list features, encoded categorical features and raw float features
# side by side; rows are aligned on the index of the three frames.
feature_frames = [only_lists_transformed_df, category_transformed_df, float_like_df]
combined_df = pd.concat(feature_frames, axis='columns')


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Feature matrix used by the PCA and model cells.
X = combined_df

# Fill missing values with each column's median.
imputer = SimpleImputer(strategy='median').fit(X)
imputed = imputer.transform(X)
imputed_df = pd.DataFrame(data=imputed, columns=X.columns)

# Standardise every column before PCA.
scaler = StandardScaler().fit(imputed_df)
combined_scaled = scaler.transform(imputed_df)

In [None]:
# Fit a PCA that keeps every component so the full variance profile can be inspected.
pca_result = PCA().fit(combined_scaled)

explained_variance = pca_result.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
component_numbers = range(1, len(explained_variance) + 1)  # 1-based axis labels

# Scree plot: per-component share as bars, running total as a step line.
plt.figure(figsize=(10, 5))
plt.bar(component_numbers, explained_variance, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_numbers, cumulative_variance, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.title('Scree Plot')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

# argmax returns the first position where the running total reaches 50%; +1 turns that
# zero-based position into a component count.
n_components = np.argmax(cumulative_variance >= 0.50) + 1
print("Number of components to retain to explain at least 50% of the variance:", n_components)

In [None]:
from processing import pca

# One correlation circle per column family plus one for all families together: first
# with the helper's default component pair, then with components 1 and 2 requested.
all_columns_for_analysis = (
    float_columns_for_analysis + categorical_columns_for_analysis + list_like_columns_for_analysis
)
components_1_2 = {"component1": 1, "component2": 2}

# Default component pair.
pca.plot_correlation_circle(pca_result, list_like_columns_for_analysis,
                            title="Correlation on list like columns")
pca.plot_correlation_circle(pca_result, categorical_columns_for_analysis,
                            title="Correlation on categorical like columns")
pca.plot_correlation_circle(pca_result, float_columns_for_analysis,
                            title="Correlation on float like columns")
pca.plot_correlation_circle(pca_result, all_columns_for_analysis,
                            title="All columns")

# Components 1 and 2.
pca.plot_correlation_circle(pca_result, list_like_columns_for_analysis,
                            title="Correlation on list like columns 1 - 2", **components_1_2)
pca.plot_correlation_circle(pca_result, categorical_columns_for_analysis,
                            title="Correlation on categorical like columns 1 - 2", **components_1_2)
pca.plot_correlation_circle(pca_result, float_columns_for_analysis,
                            title="Correlation on float like columns 1 - 2", **components_1_2)
pca.plot_correlation_circle(pca_result, all_columns_for_analysis,
                            title="All columns 1-2", **components_1_2)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# Work on a labelled copy of the scaled matrix so the PCA input stays untouched.
analysis_df = pd.DataFrame(combined_scaled.copy(), columns=X.columns)

# Regression target, taken before any columns are removed.
y = analysis_df['xxx']

# NOTE(review): confirm the target column is among the columns dropped here; otherwise
# it stays in the feature matrix.
analysis_df = analysis_df.drop(columns=["xx", "xx"])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    analysis_df,
    y,
    test_size=0.2,
    random_state=42,
)

model = RandomForestRegressor(max_depth=8, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions: mean squared error first, then R-squared.
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Rank the features by the fitted forest's importances, largest first.
feature_importances = model.feature_importances_
features = analysis_df.columns
sorted_indices = np.flip(np.argsort(feature_importances))

bar_positions = range(X_train.shape[1])
ranked_labels = [features[position] for position in sorted_indices]

fig, ax = plt.subplots(figsize=(30, 30))
ax.set_title("Feature Importance")
ax.bar(bar_positions, feature_importances[sorted_indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_labels, rotation=90)
plt.show()

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Draw the first tree of the fitted forest, truncated to its top levels.
tree = model.estimators_[0]

plt.figure(figsize=(30, 30))

plot_tree(
    tree,
    # Pass a plain list: the parameter is documented as a list of names, and some
    # scikit-learn releases reject a pandas Index here.
    feature_names=analysis_df.columns.tolist(),
    filled=True,
    rounded=True,
    max_depth=4,  # limits the drawing only; the tree itself is unchanged
    fontsize=10,
)

plt.show()