In [None]:
import pandas as pd

def _arff_attribute_name(declaration):
    """Return the attribute name from an ``@attribute`` declaration line.

    The name is the first token after the keyword. A name wrapped in single
    or double quotes may contain spaces; the surrounding quotes are dropped.

    Raises:
        ValueError: If the declaration has no name or an unterminated quote.
    """
    remainder = declaration[len('@attribute'):].strip()
    if not remainder:
        raise ValueError(f"Malformed attribute declaration: '{declaration}'")
    if remainder[0] in ('"', "'"):
        closing = remainder.find(remainder[0], 1)
        if closing == -1:
            raise ValueError(
                f"Unterminated quote in attribute declaration: '{declaration}'"
            )
        return remainder[1:closing]
    return remainder.split(None, 1)[0]


def convert_arff_to_csv(arff_filepath, csv_filepath):
    """Convert a dense ARFF file into a CSV file with a header row.

    Header lines starting with ``@attribute`` supply the column names and the
    ``@data`` line marks the start of the rows; both keywords are matched
    case-insensitively. Blank lines and ``%`` comment lines are skipped in
    both sections. Each data row is split on every comma, values are stripped
    of surrounding whitespace, and a bare ``?`` (the ARFF missing-value
    marker) becomes an empty CSV cell. Columns whose values all parse as
    numbers are converted to a numeric dtype; other columns stay as text.

    Limitations: quoted values that themselves contain commas and the sparse
    ``{index value, ...}`` row format are not handled.

    Args:
        arff_filepath: Path of the ARFF file to read.
        csv_filepath: Path of the CSV file to write (overwritten if present).

    Raises:
        ValueError: If no ``@data`` line is found, an attribute declaration
            is malformed, or a row's field count does not match the number
            of declared attributes (raised by the DataFrame constructor).
        FileNotFoundError: If ``arff_filepath`` does not exist.
    """
    data_started = False
    data_lines = []
    attributes = []

    with open(arff_filepath, 'r') as f:
        for line in f:
            line = line.strip()
            # Blank lines and '%' comments carry no data anywhere in the file.
            if not line or line.startswith('%'):
                continue

            if data_started:
                fields = (field.strip() for field in line.split(','))
                # None is written as an empty cell and read back as NaN.
                data_lines.append(
                    [None if value == '?' else value for value in fields]
                )
                continue

            keyword = line.lower()
            if keyword.startswith('@attribute'):
                attributes.append(_arff_attribute_name(line))
            elif keyword == '@data':
                data_started = True

    if not data_started:
        raise ValueError("No '@DATA' section found in the ARFF file.")

    df = pd.DataFrame(data_lines, columns=attributes)

    for col in attributes:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Deliberate best-effort: non-numeric columns are kept as text.
            pass

    df.to_csv(csv_filepath, index=False)
    print(f"Successfully converted '{arff_filepath}' to '{csv_filepath}'")


In [None]:
# Input ARFF file and output CSV location (Google Drive paths as mounted in Colab).
arff_input_file = '/content/drive/MyDrive/Case Studies/PayPal/datasets/dataset_'
csv_output_file = '/content/drive/MyDrive/Case Studies/PayPal/datasets/card_transdata_cleaned.csv'

try:
    convert_arff_to_csv(arff_input_file, csv_output_file)

    # Read the CSV back to confirm the written file parses as expected.
    df_cleaned = pd.read_csv(csv_output_file)
    print("\nFirst 5 rows of the cleaned data:")
    print(df_cleaned.head())
    print("\nData types of the cleaned data:")
    # info() prints its own report and returns None, so wrapping it in
    # print() would emit a stray "None" line after the summary.
    df_cleaned.info()
except FileNotFoundError as e:
    # Report the path that was actually missing; fall back to the input file
    # when the exception carries no filename.
    print(f"Error: The file '{e.filename or arff_input_file}' was not found.")
except ValueError as e:
    print(f"Error processing ARFF file: {e}")

Successfully converted '/content/drive/MyDrive/Case Studies/PayPal/datasets/dataset_' to '/content/drive/MyDrive/Case Studies/PayPal/datasets/card_transdata_cleaned.csv'

First 5 rows of the cleaned data:
   distance_from_home  distance_from_last_transaction  \
0           57.877857                        0.311140   
1           10.829943                        0.175592   
2            5.091079                        0.805153   
3            2.247564                        5.600044   
4           44.190936                        0.566486   

   ratio_to_median_purchase_price  repeat_retailer  used_chip  \
0                        1.945940              1.0        1.0   
1                        1.294219              1.0        0.0   
2                        0.427715              1.0        0.0   
3                        0.362663              1.0        1.0   
4                        2.222767              1.0        1.0   

   used_pin_number  online_order  fraud  
0              0.0 