In [1]:
import pandas as pd

In [2]:
# Load the raw test transactions fixture into a DataFrame
transactions = pd.read_csv(
    filepath_or_buffer='../tests/test_data/test_transactions.csv',
)

### Transform and clean the transactions data

#### Remove missing values

In [3]:
# Keep only the rows where both transaction_date and amount are present
transactions = transactions.dropna(subset=['transaction_date', 'amount'])

# Summarise the remaining rows, columns and dtypes
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10246 entries, 0 to 10499
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    10246 non-null  int64 
 1   customer_id       10246 non-null  int64 
 2   transaction_date  10246 non-null  object
 3   amount            10246 non-null  object
dtypes: int64(2), object(2)
memory usage: 400.2+ KB


#### Remove invalid values

In [4]:
# Keep every row whose amount is anything other than the literal string "INVALID"
transactions = transactions.loc[transactions['amount'].ne('INVALID')]

#### Standardise the date format

In [5]:
# Convert all dates into dd/mm/yyyy format - write function to handle the different types of date formats
def standardise_date(date_str):
    """Parse a date string written in one of several known layouts.

    The layouts are tried in a fixed order and the first one that matches wins:
    year-first with '/' or '-', day + abbreviated month name, abbreviated month
    name + day with a comma, day + full month name, and day-first with '-' or '/'.

    Parameters
    ----------
    date_str : str or missing value
        Raw date text taken from the transaction_date column.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or NaT when the input is missing, empty, or does not
        match any of the supported layouts.
    """
    candidate_formats = (
        '%Y/%m/%d',
        '%Y-%m-%d',
        '%d %b %Y',
        '%b %d, %Y',
        '%d %B %Y',
        '%d-%m-%Y',
        '%d/%m/%Y',
    )

    is_blank = pd.isna(date_str) or date_str == ''
    if not is_blank:
        for pattern in candidate_formats:
            try:
                parsed = pd.to_datetime(date_str, format=pattern)
            except ValueError:
                # This layout does not fit the text; move on to the next one
                pass
            else:
                return parsed

    # Missing input, or no layout matched
    return pd.NaT


# Apply the standardise_date function to the transaction_date column and
# render every parsed date as a dd/mm/yyyy string.
# pd.to_datetime guarantees a datetime column even when there are no rows for
# apply to infer the dtype from, so the .dt accessor cannot fail on an empty frame.
transactions['transaction_date'] = (
    pd.to_datetime(transactions['transaction_date'].apply(standardise_date))
    .dt.strftime('%d/%m/%Y')
)

# Dates that could not be parsed are NaT and format to missing values; drop them
transactions = transactions.dropna(subset=['transaction_date'])

# Display the DataFrame info
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8977 entries, 0 to 10499
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    8977 non-null   int64 
 1   customer_id       8977 non-null   int64 
 2   transaction_date  8977 non-null   object
 3   amount            8977 non-null   object
dtypes: int64(2), object(2)
memory usage: 350.7+ KB


#### Remove any duplicates after cleaning and transforming

In [6]:
# Drop rows that are exact duplicates across every column
# NOTE(review): amount is still stored as text at this point, so rows whose
# amounts differ only in formatting (e.g. '5' vs '5.0') are not treated as
# duplicates - confirm this matches the pipeline under test.
transactions = transactions.drop_duplicates()

transactions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8534 entries, 0 to 9981
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   transaction_id    8534 non-null   int64 
 1   customer_id       8534 non-null   int64 
 2   transaction_date  8534 non-null   object
 3   amount            8534 non-null   object
dtypes: int64(2), object(2)
memory usage: 333.4+ KB


In [7]:
# Cast the amount column from text to a floating-point dtype
transactions = transactions.astype({'amount': float})

In [8]:
# Write the cleaned transactions out as the expected-results file (row index omitted)
transactions.to_csv(
    path_or_buf='../tests/test_data/expected_transactions_clean_results.csv',
    index=False,
)