In [74]:
# Import modules
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [75]:
# Load the sample dataset into df_original and preview its first 29 rows.
CSV_PATH = "synthetic_sample_data_minimal.csv"
df_original = pd.read_csv(CSV_PATH)
print("Original DataFrame:", df_original.head(29), sep="\n")

Original DataFrame:
     Movie Id            Genre Release Date    Rating  \
0   TOYHANVUR           Horror   07/07/1985  9.750176   
1   CORCILSLF      Documentary   05/08/1999  7.912876   
2   UBWUVIHEL        Adventure   08/05/1996  3.116053   
3   CGHMHKJHH          Fantasy   21/07/2018  8.811783   
4   WMLWHTAMN          Fantasy   28/10/2011  5.250991   
5   REKCPGLOJ           Comedy   04/06/2013  8.676586   
6   GSDOQQRPD          Western   25/12/1995  9.191819   
7   VMIYHZNZJ        Adventure   23/03/2018  9.459478   
8   XVHGPYBBL          Mystery   19/06/2002  9.125866   
9   NNZFDYZTH          History   16/02/2019  9.232197   
10  VZTEXQGBB           Comedy   22/01/2018  8.912548   
11  XSPIAJYJM          Fantasy   19/11/1985  8.561358   
12  VFXKEREAP          Fantasy   29/07/1992  7.742613   
13  MCCASXIDK            Crime   26/12/2003  9.727619   
14  MTUOJCLUA           Horror   03/03/1989  6.913383   
15  DBQTOTADE            Drama   23/10/1990  8.276999   
16  TKBEDDC

### Missing Value Handler Module Testing

In [76]:
from data_cleaner.missing_value_handler import MissingValueHandler

# Every scenario below works on its own copy of df_original (the frame read
# from the CSV earlier in the notebook). This keeps the scenarios independent
# of each other and avoids re-reading the same file from disk for each one.

# Initialize the MissingValueHandler
mv_handler = MissingValueHandler(df_original.copy())

# Identify missing values
missing_values = mv_handler.identify_missing()
print("Missing Values:\n", missing_values)

# Impute missing values with the mean for specific columns
df_mean_imputed = df_original.copy()
mv_handler_mean = MissingValueHandler(df_mean_imputed)
df_imputed_mean = mv_handler_mean.impute_mean(columns=['Budget in USD'])
print("Data after mean imputation:\n", df_imputed_mean['Budget in USD'].head(29))

# Impute missing values with the median for specific columns
df_median_imputed = df_original.copy()
mv_handler_median = MissingValueHandler(df_median_imputed)
df_imputed_median = mv_handler_median.impute_median(columns=['Budget in USD'])
print("Data after median imputation:\n", df_imputed_median['Budget in USD'].head(29))

# Impute missing values with a constant value (here 0) for specific columns
df_constant_imputed = df_original.copy()
mv_handler_constant = MissingValueHandler(df_constant_imputed)
df_imputed_constant = mv_handler_constant.impute_constant(0, columns=['Budget in USD'])
print("Data after constant imputation:\n", df_imputed_constant['Budget in USD'].head(29))

# Drop rows with missing values.
# The handler gets its own copy directly so df_dropped_missing only ever
# refers to the result, not to the untouched input as well.
mv_handler_drop = MissingValueHandler(df_original.copy())
df_dropped_missing = mv_handler_drop.drop_missing()
print("Data after dropping missing values:\n", df_dropped_missing['Budget in USD'].head(29))

Missing Values:
     Movie Id  Genre  Release Date  Rating  Summary  Shooting Location  \
0      False  False         False   False    False              False   
1      False  False         False   False    False              False   
2      False  False         False   False    False              False   
3      False  False         False   False    False              False   
4      False  False         False   False    False              False   
5      False  False         False   False    False              False   
6      False  False         False   False    False              False   
7      False  False         False   False    False              False   
8      False  False         False   False    False              False   
9      False  False         False   False    False              False   
10     False  False         False   False    False              False   
11     False  False         False   False    False              False   
12     False  False         False 

### Outlier Handler Module Testing

In [77]:
from data_cleaner.outlier_handler import OutlierHandler

# Each scenario below gets its own copy of df_original (the frame read from
# the CSV earlier in the notebook) instead of re-reading the file from disk,
# so the scenarios stay independent of each other.

# Initialize the OutlierHandler.
# NOTE(review): this instance is not used by the scenarios in this cell; it is
# kept so the name stays defined for any code that may rely on it.
outlier_handler = OutlierHandler(df_original.copy())

# Identify outliers in the 'Rating' column
df_identify_outliers = df_original.copy()
outlier_handler_identify = OutlierHandler(df_identify_outliers)
outliers = outlier_handler_identify.identify_outliers(['Rating'])
print("Outliers:\n", outliers)

# Remove outliers from the 'Rating' column
df_identify_outliers_remove = df_original.copy()
outlier_handler_remove = OutlierHandler(df_identify_outliers_remove)
removed_outliers = outlier_handler_remove.remove_outliers(['Rating'])
print("Removed Outliers:\n", removed_outliers['Rating'])

# Replace outliers in the 'Rating' column using the handler's IQR-based method
df_identify_outliers_replace = df_original.copy()
outlier_handler_replace = OutlierHandler(df_identify_outliers_replace)
replaced_outliers_df = outlier_handler_replace.replace_outliers_with_iqr(columns=['Rating'])
print("DataFrame after Replacing Outliers with IQR:\n", replaced_outliers_df['Rating'])

Outliers:
     Rating
0    False
1    False
2     True
3    False
4     True
5    False
6    False
7    False
8    False
9    False
10   False
11   False
12   False
13   False
14   False
15   False
16   False
17   False
18   False
19   False
20   False
21   False
22   False
23   False
24   False
25   False
26   False
27   False
28   False
Removed Outliers:
 0     9.750176
1     7.912876
3     8.811783
5     8.676586
6     9.191819
7     9.459478
8     9.125866
9     9.232197
10    8.912548
11    8.561358
12    7.742613
13    9.727619
14    6.913383
15    8.276999
16    7.591583
17    9.603778
18    9.801080
19    9.227341
20    7.858370
21    5.395632
22    8.336223
23    7.216878
24    9.397201
25    9.852601
26    9.090585
27    9.592543
28    5.720490
Name: Rating, dtype: float64
DataFrame after Replacing Outliers with IQR:
 0     9.750176
1     7.912876
2     8.811783
3     8.811783
4     8.811783
5     8.676586
6     9.191819
7     9.459478
8     9.125866
9     9.232197
10    8.91

### Scaler Module Testing

In [78]:
from data_cleaner.scaler import Scaler

# Min-max scaling of the 'Rating' column on a copy of the original data
scaler = Scaler(df_original.copy())
min_max_scaled_df = scaler.min_max_scale(columns=['Rating'])
print("Min-Max Scaled DataFrame:\n", min_max_scaled_df['Rating'])

# Standard scaling of the 'Rating' column on its own copy of the original data.
# Fix: the scaler used to be built from df_identify_outliers, a frame that
# belongs to the outlier-handler cell, while the frame prepared here
# (df_standart_scale) was never used. The cell no longer depends on that
# leftover variable.
df_standart_scale = df_original.copy()
standart_scale = Scaler(df_standart_scale)
standard_scaled_df = standart_scale.standard_scale(columns=['Rating'])
print("Standard Scaled DataFrame:\n", standard_scaled_df['Rating'])

Min-Max Scaled DataFrame:
 0     0.984796
1     0.712060
2     0.000000
3     0.845497
4     0.316919
5     0.825428
6     0.901911
7     0.941643
8     0.892121
9     0.907905
10    0.860455
11    0.808323
12    0.686785
13    0.981447
14    0.563691
15    0.766111
16    0.664366
17    0.963064
18    0.992352
19    0.907184
20    0.703968
21    0.338390
22    0.774903
23    0.608743
24    0.932399
25    1.000000
26    0.886883
27    0.961396
28    0.386613
Name: Rating, dtype: float64
Standard Scaled DataFrame:
 0     0.915482
1    -0.208226
2    -3.142005
3     0.341553
4    -1.836258
5     0.258866
6     0.573986
7     0.737689
8     0.533649
9     0.598682
10    0.403182
11    0.188391
12   -0.312361
13    0.901686
14   -0.819525
15    0.014475
16   -0.404732
17    0.825944
18    0.946615
19    0.595712
20   -0.241563
21   -1.747794
22    0.050697
23   -0.633905
24    0.699600
25    0.978126
26    0.512070
27    0.819073
28   -1.549109
Name: Rating, dtype: float64


### Text Cleaner Module Testing

In [79]:
from data_cleaner.text_cleaner import TextCleaner

# Each entry pairs the heading printed before a pass with the name of the
# TextCleaner method applied to every summary during that pass.
cleaning_passes = [
    ("Convert to lowercase:\n", "lowercase"),
    ("\nNo punctuation:\n", "remove_punctuation"),
    ("\nNo stopwords:\n", "remove_stopwords"),
    ("\nStemmed:\n", "stem_text"),
    ("\nLemmatized:\n", "lemmatize_text"),
]

for heading, method_name in cleaning_passes:
    print(heading)
    for index, text in enumerate(df_original['Summary']):
        # A new cleaner is built for each summary, so every pass starts
        # from the original text rather than from a previous pass's result.
        text_cleaner = TextCleaner(text)
        text = getattr(text_cleaner, method_name)()
        print(f"{index}) {text!s}")

Convert to lowercase:

0) a group of college students get more than they bargained for when they spend a weekend at a haunted house.
1) a documentary that sheds light on the devastating effects of climate change on our planet.
2) a lost civilization is rediscovered deep within the amazon rainforest, revealing secrets of the past.
3) a young inventor builds a time machine and embarks on a journey through history to save the future.
4) a young inventor builds a time machine and embarks on a journey through history to save the future.
5) a heartwarming story about a dog who brings two lonely people together.
6) this historical drama explores the pivotal events that led to the american revolution.
7) in a post-apocalyptic world overrun by zombies, a group of survivors must band together to find sanctuary and rebuild civilization.
8) when a series of strange events plague a small town, a group of teenagers must uncover the truth behind the mystery before it's too late.
9) a group of musicia

### Categorical Encoder Module Testing

In [80]:
from data_cleaner.categorical_encoder import CategoricalEncoder

# One encoder instance is shared by both encoding demos below.
categorical_encoder = CategoricalEncoder()

# One-hot encoding: the encoder receives 'Genre' as a single-column DataFrame.
genre_data = df_original.copy()[['Genre']]
one_hot_encoded_genre = categorical_encoder.fit_transform_one_hot(genre_data)
print("One-Hot Encoded Genre:\n", one_hot_encoded_genre)

# Label encoding: the encoder receives the 'Genre' column itself.
genre_column = df_original.copy()['Genre']
label_encoded_genre = categorical_encoder.fit_transform_label(genre_column)
print("Label Encoded Genre:\n", label_encoded_genre)

One-Hot Encoded Genre:
 [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0.

### Data Type Converter Testing

In [81]:
from data_cleaner.data_type_converter import DataTypeConverter

# Column dtypes before any conversion, for comparison with the result below.
print(df_original.dtypes)

# Convert the listed columns on a copy, leaving df_original untouched.
categorical_columns = ['Genre']
converter = DataTypeConverter()
df = converter.convert_to_categorical(
    df_original.copy(),
    categorical_columns,
)
print("Converted to categorical:\n", df[categorical_columns].dtypes)

Movie Id              object
Genre                 object
Release Date          object
Rating               float64
Summary               object
Shooting Location     object
Budget in USD        float64
Awards                 int64
Popular                int64
dtype: object
Converted to categorical:
 Genre    category
dtype: object


### Date Time Handler Testing

In [82]:
from data_cleaner.date_time_handler import DateTimeHandler

# This cell reads `df` as left by the Data Type Converter cell and reassigns
# it after each step, so it must run after that cell and is order-dependent.
print(df['Release Date'])
# Initialize the DateTimeHandler with 'Release Date'
# (presumably the name of the column it operates on; verify against the class)
date_handler = DateTimeHandler('Release Date')

# Convert to datetime format; the source strings are day/month/year
print("\nConvert to datetime format\n")
df = date_handler.convert_to_datetime(df, format='%d/%m/%Y')
print(df['Release Date'])

# Extract the date parts
# NOTE(review): only 'Release Date' is printed below, so whatever
# extract_date_parts adds to the frame is not shown here; confirm the names
# of the new columns and print those instead.
print("\nExtracted version of date parts:\n")
df = date_handler.extract_date_parts(df)
print(df['Release Date'])

# Filter by date range
# NOTE(review): the day/month order of these strings is ambiguous; both bounds
# use 07-07, so either reading gives the same dates. Confirm which format
# filter_by_date_range expects before changing them.
print("\nFilter by date range: \n")
start_date = '07-07-1985'
end_date = '07-07-1989'
df_filtered = date_handler.filter_by_date_range(df, start_date, end_date)
print(df_filtered['Release Date'])

# Add 10 days to all dates
print("\nAdded 10 days version : \n")
days_to_add = 10
df = date_handler.add_days(df, days_to_add)

print(df['Release Date'])


0     07/07/1985
1     05/08/1999
2     08/05/1996
3     21/07/2018
4     28/10/2011
5     04/06/2013
6     25/12/1995
7     23/03/2018
8     19/06/2002
9     16/02/2019
10    22/01/2018
11    19/11/1985
12    29/07/1992
13    26/12/2003
14    03/03/1989
15    23/10/1990
16    26/06/2009
17    14/12/2005
18    12/08/1995
19    20/11/2015
20    02/08/2005
21    15/08/1997
22    06/12/2001
23    04/02/2002
24    02/11/1991
25    02/12/1991
26    18/09/1991
27    25/01/1993
28    02/05/2019
Name: Release Date, dtype: object

Convert to datetime format

0    1985-07-07
1    1999-08-05
2    1996-05-08
3    2018-07-21
4    2011-10-28
5    2013-06-04
6    1995-12-25
7    2018-03-23
8    2002-06-19
9    2019-02-16
10   2018-01-22
11   1985-11-19
12   1992-07-29
13   2003-12-26
14   1989-03-03
15   1990-10-23
16   2009-06-26
17   2005-12-14
18   1995-08-12
19   2015-11-20
20   2005-08-02
21   1997-08-15
22   2001-12-06
23   2002-02-04
24   1991-11-02
25   1991-12-02
26   1991-09-18
27   1993-01

### Feature Engineer Testing

In [83]:
from data_cleaner.feature_engineer import FeatureEngineer

# Build the FeatureEngineer on the current `df` from the previous cell.
feature_engineer = FeatureEngineer(df)

# Polynomial features of 'Rating' up to degree 3; show the generated columns.
print("Polynomial feature testing : \n")
df = feature_engineer.add_polynomial_features('Rating', degree=3)
polynomial_columns = ["Rating_poly_2", "Rating_poly_3"]
print(df[polynomial_columns])

# Interaction feature between 'Awards' and 'Popular'; show the new column.
print("\nInteraction feature testing : \n")
df = feature_engineer.add_interaction_features('Awards', 'Popular')
interaction_column = 'Awards_x_Popular'
print(df[interaction_column])

Polynomial feature testing : 

    Rating_poly_2  Rating_poly_3
0       95.065926     926.909474
1       62.613614     495.453788
2        9.709784      30.256196
3       77.647519     684.213079
4       27.572905     144.785071
5       75.283150     653.200754
6       84.489537     776.612529
7       89.481731     846.450502
8       83.281426     760.015119
9       85.233470     786.892222
10      79.433513     707.955006
11      73.296843     627.520476
12      59.948052     464.154547
13      94.626571     920.491231
14      47.794870     330.424257
15      68.508715     567.046576
16      57.632135     437.519152
17      92.232555     885.781003
18      96.061165     941.503145
19      85.143820     785.651057
20      61.753977     485.285588
21      29.112847     157.082213
22      69.492619     579.305984
23      52.083329     375.879031
24      88.307388     829.842274
25      97.073748     956.428918
26      82.638728     751.234341
27      92.016887     882.675968
28      32.7