In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid" , palette="muted")


In [2]:
df=pd.read_csv("audiobook_data_2.csv")
df.rename(columns={"Unnamed:0": "row_id"}, inplace=True)
print("Original Shape:",df.shape)

Original Shape: (14084, 12)


In [3]:
print("Missing values:\n",df.isna().sum())
df.dropna(inplace=True)
print("Shape after dropping missing values:",df.shape)

Missing values:
 Unnamed: 0                         0
Book_length(mins)_overall          0
Book_length(mins)_avg              0
Price_overall                      0
Price_avg                          0
Review                             0
Review10/10                        0
Completion                         0
Minutes_listened                   0
Support_Request                    0
Last_Visited_mins_Purchase_date    0
Target                             0
dtype: int64
Shape after dropping missing values: (14084, 12)


In [4]:
print("Duplicated rows:",df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("Shape after dropping duplicates:",df.shape)

Duplicated rows: 0
Shape after dropping duplicates: (14084, 12)


In [6]:
numeric_cols=df.select_dtypes(include=[np.number]).columns.tolist()

for col in numeric_cols:
    if col not in ["row_id","Target"]:
        q1=df[col].quantile(0.25)
        q3=df[col].quantile(0.75)
        iqr=q3-q1
        lower=q1-1.5*iqr
        upper=q3+1.5*iqr
        df[col]=np.where(df[col]< lower , lower, np.where(df[col] > upper,upper,df[col]))
print("Outliers handled(using IQR).")        

Outliers handled(using IQR).


In [7]:
df["Listening_Efficiency"] = df["Minutes_listened"]/(df["Book_length(mins)_overall"]+1)
df["Value_for_money"] = df["Minutes_listened"] / (df["Price_overall"]+1)
print("New features created:")
print("Listening_Efficiency")
print("Value_for_money")

New features created:
Listening_Efficiency
Value_for_money


In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaled_cols=["Minutes_listened","Book_length(mins)_overall","Price_overall","Completion","Last_Visited_mins_Purchase_date"]
df[scaled_cols]=scaler.fit_transform(df[scaled_cols])
print("Selected columns scaled between 0-1")

Selected columns scaled between 0-1


In [11]:
df.to_csv("audiobook_data_cleaned.csv", index=False)
print("Cleaned dataset saved as audiobook_data_cleaned.csv")
df.head()


Cleaned dataset saved as audiobook_data_cleaned.csv


Unnamed: 0.1,Unnamed: 0,Book_length(mins)_overall,Book_length(mins)_avg,Price_overall,Price_avg,Review,Review10/10,Completion,Minutes_listened,Support_Request,Last_Visited_mins_Purchase_date,Target,Listening_Efficiency,Value_for_money
0,994.0,0.722222,1620.0,1.0,12.005,0.0,8.91,1.0,1.0,0.0,0.350476,0,0.299815,37.370242
1,1143.0,1.0,2160.0,0.180479,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0,0.0,0.0
2,2059.0,1.0,2160.0,0.180479,5.33,0.0,8.91,0.0,0.0,0.0,1.0,0,0.0,0.0
3,2882.0,0.722222,1620.0,0.257827,5.96,0.0,8.91,1.0,1.0,0.0,0.491429,0,0.299815,69.827586
4,3342.0,1.0,2160.0,0.180479,5.33,0.0,8.91,0.676923,0.977778,0.0,1.0,0,0.219898,75.07109
