## Imports

In [None]:
import pandas as pd
import seaborn as sns
from scipy.stats import norm
import numpy as np
import matplotlib.pyplot as plt

from src.shared.constants import DATASET_NAME, RAW_DATA_PATH
from src.shared.infrastructure.json_data_loader import JSONDataLoader

## Load data as pandas DataFrame

In [None]:
# Echo the configured raw-data directory.
print(RAW_DATA_PATH)

# Build the path to the dataset's JSON file and read it through the project loader.
MCPL_data_path = "{}/{}.json".format(RAW_DATA_PATH, DATASET_NAME)
json_data_loader = JSONDataLoader()
MCPL_dataset = json_data_loader.load_data(MCPL_data_path)

# Wrap the loaded data in a pandas DataFrame and preview the first five rows.
MCPL_dataset_df = pd.DataFrame.from_dict(MCPL_dataset)
MCPL_dataset_df.head(5)

## Relationship between variables

In [None]:
# Derive the feature 'ratio_cols_rows' as cols_number divided by rows_number
# (element-wise, one value per DataFrame row) and store it as a new column.
ratio_cols_rows = MCPL_dataset_df["cols_number"] / MCPL_dataset_df["rows_number"]
MCPL_dataset_df["ratio_cols_rows"] = ratio_cols_rows
# Draw the pairwise correlation matrix (derived feature included) as a heatmap,
# annotating every cell with its value rounded to two decimals.
# NOTE(review): recent pandas versions raise in corr() when the frame holds
# non-numeric columns — confirm every column here is numeric.
sns.heatmap(
    data=MCPL_dataset_df.corr(),
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
)

In [None]:
# Grid of pairwise plots across the DataFrame's variables.
sns.pairplot(data=MCPL_dataset_df)

## Distribution of max_char_per_line variable

In [None]:
# Histogram of the raw max_char_per_line values.
# NOTE(review): seaborn documents distplot as deprecated (since 0.11) in favour
# of histplot/displot — check the pinned seaborn version before migrating.
sns.distplot(a=MCPL_dataset_df.max_char_per_line)

In [None]:
# NumPy function log1p applies log(1 + x) to every element of the column;
# fit=norm overlays a fitted normal curve on the transformed distribution.
sns.distplot(a=np.log1p(MCPL_dataset_df["max_char_per_line"]), fit=norm)

In [None]:
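# Disabled sanity check: np.expm1 is the inverse of np.log1p, so the last line
# should give back the original max_char_per_line values.
# MCPL_log = np.log1p(MCPL_dataset_df.max_char_per_line)
# MCPL_log
# np.expm1(MCPL_log)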

In [None]:
import scipy.stats as stats

# Probability plot of the untransformed max_char_per_line values against the
# quantiles of a normal distribution, drawn through pyplot and then displayed.
stats.probplot(x=MCPL_dataset_df["max_char_per_line"], dist="norm", plot=plt)
plt.show()