In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the data file
# read further down lives under that mount point.
from google.colab import drive

drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn


In [43]:

class DataProcessor:
    """Stateless helper for loading, summarising, cleaning and encoding tabular data.

    Every method receives the data it works on as an argument; the instance
    itself stores nothing.
    """

    def __init__(self):
        pass

    def read_file(self, file_path):
        """Load a tabular file into a DataFrame, choosing the reader by extension.

        Args:
            file_path: Path (string or ``os.PathLike``) to a ``.csv``, ``.xlsx``,
                ``.xls`` or ``.json`` file. The extension is matched
                case-insensitively.

        Returns:
            pandas.DataFrame: The parsed contents of the file.

        Raises:
            ValueError: If the extension is not one of the supported types.
        """
        import os  # local import so the class does not rely on another notebook cell

        # Use the real extension rather than the last four characters: a
        # four-character slice can never equal '.xlsx' or '.json'.
        file_type = os.path.splitext(file_path)[1].lower()

        if file_type == '.csv':
            return pd.read_csv(file_path)
        elif file_type in ['.xlsx', '.xls']:
            return pd.read_excel(file_path)
        elif file_type == '.json':
            return pd.read_json(file_path)
        else:
            raise ValueError(f"Oops, unsupported file type: {file_type}\nOnly supported file types at the moment are:\n1- .csv\n2- .xlsx\n3- .xls\n4- .json")

    def data_summary(self, df):
        """Build a one-row-per-column summary of ``df``.

        Integer and float columns get mean, median, mode, quartiles, min and
        max. Object/string columns get mode, number of distinct values and a
        value -> count dict. Both kinds get duplicate and null counts.
        Statistics that do not apply to a column are left missing. Columns of
        any other dtype (e.g. bool, datetime, category) are skipped.

        Args:
            df: The DataFrame to describe. It is not modified.

        Returns:
            pandas.DataFrame: One row per summarised column. When no column
            qualifies, an empty frame with the summary columns is returned.
        """
        summary_columns = [
            'column_name', 'column_type', 'mean', 'median', 'mode',
            '1st_quartile', '3rd_quartile', 'min', 'max', 'duplicates',
            'null_values', 'categories', 'categories_frequencies',
        ]
        stats = []

        for col in df.columns:
            series = df[col]
            col_type = series.dtype

            # Accept every integer/float width (and nullable variants), not
            # only int64/float64; bool and complex are deliberately excluded.
            is_numeric = (pd.api.types.is_integer_dtype(col_type)
                          or pd.api.types.is_float_dtype(col_type))
            is_text = (pd.api.types.is_object_dtype(col_type)
                       or pd.api.types.is_string_dtype(col_type))
            if not (is_numeric or is_text):
                continue

            # mode() is empty for an all-null (or zero-length) column, so
            # indexing it unconditionally would raise.
            modes = series.mode()

            row = dict.fromkeys(summary_columns)
            row.update({
                'column_name': col,
                'column_type': col_type,
                'mode': modes.iloc[0] if not modes.empty else None,
                'duplicates': series.duplicated().sum(),
                'null_values': series.isnull().sum(),
            })

            if is_numeric:
                row.update({
                    'mean': series.mean(),
                    'median': series.median(),
                    '1st_quartile': series.quantile(0.25),
                    '3rd_quartile': series.quantile(0.75),
                    'min': series.min(),
                    'max': series.max(),
                })
            else:
                row.update({
                    'categories': series.nunique(),
                    'categories_frequencies': series.value_counts().to_dict(),
                })

            # Wrap each value in a list so dict values are stored as a single
            # cell instead of being expanded.
            stats.append(pd.DataFrame({key: [value] for key, value in row.items()}))

        # pd.concat raises on an empty list; return an empty summary instead.
        if not stats:
            return pd.DataFrame(columns=summary_columns)

        return pd.concat(stats, ignore_index=True)

    def handle_missing_values(self, df):
        """Drop or fill missing values, reporting each action on stdout.

        For every column that contains nulls:
        * more than 50% missing -> the column is dropped;
        * float column -> nulls are replaced by the column mean;
        * anything else -> nulls are replaced by the column mode.

        Args:
            df: The DataFrame to clean. NOTE: it is modified in place.

        Returns:
            pandas.DataFrame: The same object that was passed in.
        """
        if df.isnull().sum().sum() != 0:
            print("!!!THE DATASET CONTAINS SOME MISSING VALUES!!!")

            cols = df.columns[df.isnull().any()]

            print("Columns containing missing values:")
            print(cols)

            for col in cols:
                if df[col].isnull().mean() * 100 > 50:
                    df.drop(col, axis=1, inplace=True)
                    print(f"The '{col}' column was dropped because more than 50% of the column's values are missing")
                elif pd.api.types.is_float_dtype(df[col].dtype):
                    # Assign the result back: an in-place fillna on df[col]
                    # acts on a temporary object and may not reach df under
                    # pandas Copy-on-Write. Only float columns are mean-filled
                    # because a fractional mean cannot be stored in an integer
                    # column.
                    df[col] = df[col].fillna(df[col].mean())
                    print(f"The '{col}' column's missing values were filled with the mean of the column")
                else:
                    # Safe to index: at most 50% of this column is null, so at
                    # least one non-null value exists and mode() is non-empty.
                    mode_value = df[col].mode()[0]
                    df[col] = df[col].fillna(mode_value)
                    print(f"The '{col}' column's missing values were filled with the mode of the column")

            if df.isnull().sum().sum() == 0:
                print("THE DATASET DOESN'T CONTAIN ANY MISSING VALUES ANYMORE!! :)")
        else:
            print("THE DATASET DOESN'T CONTAIN ANY MISSING VALUES!! :)")

        return df

    def encode_categorical(self, df):
        """One-hot encode every object-dtype column of ``df``.

        Args:
            df: The DataFrame to encode. It is not modified.

        Returns:
            pandas.DataFrame: A new frame in which each object column is
            replaced by its indicator columns; other columns are kept as is.
        """
        obj_cols = df.select_dtypes(include=['object']).columns
        encoded_df = pd.get_dummies(df, columns=obj_cols)
        return encoded_df


In [44]:
# End-to-end run of the DataProcessor pipeline on the complaints export.
processor = DataProcessor()

# Load the raw CSV from the mounted Drive folder.
df = processor.read_file('/content/drive/MyDrive/Complaints.csv')

# Summarise first: the next step edits `df`, so this is the only point at
# which the summary reflects the data exactly as loaded.
df_summary = processor.data_summary(df)
print(df_summary)

# handle_missing_values works on the frame it is given (it drops columns in
# place) and returns that same object, so `cleaned_df` and `df` are the same
# DataFrame after this call.
cleaned_df = processor.handle_missing_values(df)
print(cleaned_df)

# One-hot encode the object columns; this returns a new frame.
encoded_df = processor.encode_categorical(cleaned_df)
print(encoded_df)


               column_name column_type      mean median  \
0                  CASE_ID      object      None   None   
1               OFFER_NAME      object      None   None   
2            CUSTOMER_TYPE      object      None   None   
3           CUSTOMER_GROUP      object      None   None   
4           CURRENT_STATUS      object      None   None   
5          ESCALATION_FLAG      object      None   None   
6          ESCALATED_GROUP      object      None   None   
7                OPEN_DATE      object      None   None   
8                OPEN_USER      object      None   None   
9               CLOSE_DATE      object      None   None   
10             CLOSE_GROUP      object      None   None   
11              CLOSE_USER      object      None   None   
12             AGE_BRACKET     float64  1.503302    1.0   
13        ACTUAL_COMPLAINT      object      None   None   
14      CALLBACK_MECHANISM      object      None   None   
15              RESOLUTION      object      None   None 