In [1]:
# Import pandas and numpy

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the dataset

In [4]:
# Read the raw transactions export into a DataFrame; the relative path is
# resolved against the notebook's working directory.
df_transactions = pd.read_csv(filepath_or_buffer='customer_transactions_contaminated.csv')

In [5]:
# Check for initial duplicate rows

In [6]:
# duplicated() flags each row that repeats an earlier row in full; summing the
# boolean flags gives the number of redundant rows.
duplicate_row_mask = df_transactions.duplicated()
initial_duplicates = duplicate_row_mask.sum()
print(f'Initial duplicate count: {initial_duplicates}')

Initial duplicate count: 185


In [7]:
# Remove duplicate rows

In [8]:
# Keep the first occurrence of every fully identical row and discard the rest.
# The raw frame is left untouched; the result gets its own name.
df_cleaned_transactions = df_transactions.drop_duplicates(keep='first')

In [9]:
# Verify duplicate count after dropping

In [10]:
# Sanity check: recount duplicates on the de-duplicated frame.
duplicates_after = df_cleaned_transactions.duplicated(keep='first').sum()
print(f'Duplicate count after dropping: {duplicates_after}')

Duplicate count after dropping: 0


In [11]:
# Count null values in each column

In [12]:
# Tally missing entries per column (isna() is the same check as isnull()).
null_counts = df_cleaned_transactions.isna().sum()
print('Null values in each column:', null_counts, sep='\n')

Null values in each column:
CustomerID           0
TransactionID        0
TransactionDate      0
Amount             283
ProductCategory    282
PaymentMethod        0
dtype: int64


In [13]:
# Drop all rows with any null values

In [14]:
# Discard every row that has a missing value in at least one column
# (axis=0 / how='any' spell out the dropna() defaults).
df_cleaned_transactions = df_cleaned_transactions.dropna(axis=0, how='any')

In [15]:
# Verify that there are no more null values

In [16]:
# Sanity check: recount missing entries per column after the rows were dropped.
null_counts_after = df_cleaned_transactions.isna().sum()
print('Null values in each column after dropping:', null_counts_after, sep='\n')

Null values in each column after dropping:
CustomerID         0
TransactionID      0
TransactionDate    0
Amount             0
ProductCategory    0
PaymentMethod      0
dtype: int64


In [17]:
# Check data types

In [18]:
# Show the dtype pandas currently holds for each column, under a header line.
print('Data types before conversion:', df_cleaned_transactions.dtypes, sep='\n')

Data types before conversion:
CustomerID         object
TransactionID      object
TransactionDate    object
Amount             object
ProductCategory    object
PaymentMethod      object
dtype: object


In [19]:
# Convert 'TransactionDate' to datetime

In [20]:
# Parse 'TransactionDate' as day/month/year. With errors='coerce', any value
# that does not match the format becomes NaT instead of raising, so a bad date
# would otherwise turn into a silent missing value in the cleaned data.
parsed_transaction_dates = pd.to_datetime(
    df_cleaned_transactions['TransactionDate'],
    format='%d/%m/%Y',
    errors='coerce',
)

# Make the coercion visible: report how many dates could not be parsed.
unparseable_date_count = parsed_transaction_dates.isna().sum()
print(f'Unparseable TransactionDate values: {unparseable_date_count}')

# Store the parsed column on a new frame and drop the rows whose date is
# missing, so the frame keeps its "no null values" guarantee.
df_cleaned_transactions = (
    df_cleaned_transactions
    .assign(TransactionDate=parsed_transaction_dates)
    .dropna(subset=['TransactionDate'])
)

In [21]:
# Convert 'PaymentMethod' and 'ProductCategory' to category

In [22]:
# Cast both label columns to the pandas 'category' dtype in a single call;
# all other columns keep their current dtype.
df_cleaned_transactions = df_cleaned_transactions.astype(
    {'PaymentMethod': 'category', 'ProductCategory': 'category'}
)

In [25]:
# Replace 'Free' with 0 in the Amount column

In [26]:
# Entries equal to the text 'Free' are swapped for the number 0 so the column
# can be converted to a numeric dtype; every other entry is left as it is.
amount_with_free_as_zero = df_cleaned_transactions['Amount'].replace({'Free': 0})
df_cleaned_transactions['Amount'] = amount_with_free_as_zero

In [27]:
# Convert Amount to float

In [28]:
# Cast 'Amount' to 64-bit floats; astype raises if any entry cannot be
# converted to a number.
df_cleaned_transactions['Amount'] = df_cleaned_transactions['Amount'].astype('float64')

In [29]:
# Verify data types after conversion

In [30]:
# Show the per-column dtypes again to confirm the conversions took effect.
print('Data types after conversion:', df_cleaned_transactions.dtypes, sep='\n')

Data types after conversion:
CustomerID                 object
TransactionID              object
TransactionDate    datetime64[ns]
Amount                    float64
ProductCategory          category
PaymentMethod            category
dtype: object


In [31]:
# Save the cleaned DataFrame

In [32]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df_cleaned_transactions.to_csv(path_or_buf='customer_transactions_cleaned.csv', index=False)