In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

class DataProcessor:
    """Interactive helper that loads a tabular file and pre-processes it.

    The loaded ``pandas.DataFrame`` is stored in ``self.file``; every method
    other than ``loadFile`` operates on that frame.
    """

    def __init__(self):
        # Holds the loaded DataFrame; stays None until loadFile() succeeds.
        self.file = None

    @staticmethod
    def _readHtml(fileName):
        """Return the first table found in the HTML file *fileName*."""
        # pd.read_html returns a list of DataFrames (one per table), so a
        # single frame has to be picked for the rest of the class to work.
        return pd.read_html(fileName)[0]

    @staticmethod
    def _readText(fileName):
        """Read a delimited text file, letting pandas detect the separator."""
        # pandas has no read_txt; sep=None with the python engine sniffs the
        # delimiter instead of assuming commas.
        return pd.read_csv(fileName, sep=None, engine='python')

    def _readerFor(self, fileName):
        """Return the reader callable matching the file extension, or None."""
        # '.sql' is intentionally absent: pd.read_sql needs a query and a
        # database connection, it cannot load a file by name.
        readers = {
            '.csv': pd.read_csv,
            '.xlsx': pd.read_excel,
            '.json': pd.read_json,
            '.html': self._readHtml,
            '.txt': self._readText,
        }
        lowered = fileName.lower()  # accept 'DATA.CSV' as well as 'data.csv'
        for extension, reader in readers.items():
            if lowered.endswith(extension):
                return reader
        return None

    def loadFile(self):
        """Prompt for a file name until a supported file is read.

        Unsupported extensions and unreadable files print a message and
        prompt again instead of aborting.

        Returns:
            The loaded DataFrame (also stored in ``self.file``).
        """
        while True:
            fileName = input("Enter the name of the file (with the extension): ").strip()
            reader = self._readerFor(fileName)
            if reader is None:
                print('File type not supported. Please try again.')
                continue
            try:
                self.file = reader(fileName)
            except (OSError, ValueError) as error:
                # OSError covers missing/unreadable files; ValueError covers
                # pandas parse failures.
                print(f'Could not read {fileName!r}: {error}. Please try again.')
                continue
            return self.file

    def fileDescription(self):
        """Print summary statistics for the numeric columns of ``self.file``."""
        # 'number' selects every numeric dtype, not only int64/float64.
        numeric_columns = self.file.select_dtypes(include='number')
        mean = numeric_columns.mean()
        median = numeric_columns.median()
        standard_deviation = numeric_columns.std()
        most_frequent = numeric_columns.mode()
        min_value = numeric_columns.min()
        max_value = numeric_columns.max()

        print('Average:', mean, '\n\nMedian:', median, '\n\nStandard Deviation:', standard_deviation, '\n\nMost Frequent:', most_frequent,'\n\n Min:', min_value,'\n\n Max:', max_value)

    def handleMissingValues(self):
        """Ask how to fill missing values and apply the chosen strategy.

        The menu is shown again after an invalid choice or an invalid
        replacement number.

        Returns:
            The resulting DataFrame (also stored in ``self.file``).
        """
        prompt = 'There are missing values. How do you want to handle them?\n 1. Replace with zeros\n 2. Replace with a number\n 3. Fill with previous value\n 4. Fill with next value\n 5. Fill with max value\n 6. Fill with min value\n 7. Leave them as they are\n'
        while True:
            choice = input(prompt).strip()
            if choice == '1':
                self.file = self.file.fillna(0)
            elif choice == '2':
                try:
                    replacement = float(input('Enter the replacement value: '))
                except ValueError:
                    print('Invalid number. Please try again.')
                    continue
                self.file = self.file.fillna(replacement)
            elif choice == '3':
                self.file = self.file.ffill()
            elif choice == '4':
                self.file = self.file.bfill()
            elif choice == '5':
                # A Series as fill value is applied column by column.
                self.file = self.file.fillna(self.file.max())
            elif choice == '6':
                self.file = self.file.fillna(self.file.min())
            elif choice == '7':
                pass  # leave the missing values untouched
            else:
                print('invalid choice. Please choose one of the available options')
                continue
            return self.file

    def encodeCategoricalData(self):
        """Prompt for a column position and one-hot encode that column.

        Non-integer input and positions outside the frame print a message
        and prompt again.

        Returns:
            The encoded matrix produced by the column transformer.
        """
        while True:
            try:
                categorical_column = int(input('Enter the column number that has categorical data (starting from 0): '))
                categorical_array = self.file.iloc[:, [categorical_column]].values
            except ValueError:
                print('Invalid column number. Please enter a whole number.')
            except IndexError:
                print('Column number out of range. Please try again.')
            else:
                break

        # The extracted array has a single column, hence index 0 below.
        ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
        encoded = ct.fit_transform(categorical_array)

        return encoded

# Run the interactive pipeline: load a file, print its numeric summary,
# optionally fill gaps, then one-hot encode a user-chosen column.
processor = DataProcessor()
processor.loadFile()
processor.fileDescription()

# Only ask for a fill strategy when at least one cell is missing.
if processor.file.isna().values.any():
    processor.handleMissingValues()

encoded = processor.encodeCategoricalData()
print(encoded)

Enter the name of the file (with the extension):  Book2.csv


Average: age                     26.666667
year of graduation    2015.200000
dtype: float64 

Median: age                     19.5
year of graduation    2025.0
dtype: float64 

Standard Deviation: age                   15.187714
year of graduation    15.849290
dtype: float64 

Most Frequent:    age  year of graduation
0   13              1991.0
1   17              2007.0
2   19              2025.0
3   20              2026.0
4   40              2027.0
5   51                 NaN 

 Min: age                     13.0
year of graduation    1991.0
dtype: float64 

 Max: age                     51.0
year of graduation    2027.0
dtype: float64


There are missing values. How do you want to handle them?
 1. Replace with zeros
 2. Replace with a number
 3. Fill with previous value
 4. Fill with next value
 5. Fill with max value
 6. Fill with min value
 7. Leave them as they are
 6
Enter the column number that has categorical data (starting from 0):  4


[[0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]
