## Data

In [45]:
import pandas as pd

In [46]:
# Load the training and test sets from CSV files under the local data/ directory.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [47]:
# Show the first training row as a dict to see the column names with sample values.
train_df.iloc[0].to_dict()

{'transaction_time': '2019-12-27 15:21',
 'merch': 'fraud_Cormier LLC',
 'cat_id': 'health_fitness',
 'amount': 148.04,
 'name_1': 'Daniel',
 'name_2': 'Martinez',
 'gender': 'M',
 'street': '8510 Acevedo Burgs',
 'one_city': 'Kent',
 'us_state': 'OR',
 'post_code': 97033,
 'lat': 45.0838,
 'lon': -120.6649,
 'population_city': 60,
 'jobs': 'Museum education officer',
 'merchant_lat': 45.042827,
 'merchant_lon': -120.709327,
 'target': 0}

In [48]:
# Column dtypes as inferred by read_csv (transaction_time is still a plain object column here).
train_df.dtypes

transaction_time     object
merch                object
cat_id               object
amount              float64
name_1               object
name_2               object
gender               object
street               object
one_city             object
us_state             object
post_code             int64
lat                 float64
lon                 float64
population_city       int64
jobs                 object
merchant_lat        float64
merchant_lon        float64
target                int64
dtype: object

In [49]:
# Count rows per target value to see how the two classes are balanced.
train_df.target.value_counts()

target
0    781927
1      4504
Name: count, dtype: int64

## Pre-processing + logreg

In [50]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

def _haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Works element-wise on scalars, NumPy arrays or pandas Series.
    """
    earth_radius_km = 6371.0
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at 1. NaN inputs still
    # propagate as NaN.
    a = np.minimum(a, 1.0)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(a))


def add_features(df):
    """Return a copy of ``df`` with time and distance features appended.

    Added columns:
      * ``hour`` - hour of day parsed from ``transaction_time``.
      * ``dow`` - day of week parsed from ``transaction_time`` (Monday is 0).
      * ``distance_km`` - haversine distance between (``lat``, ``lon``) and
        (``merchant_lat``, ``merchant_lon``).

    The input frame is not modified.
    """
    df = df.copy()
    dt = pd.to_datetime(df['transaction_time'])
    df['hour'] = dt.dt.hour
    df['dow'] = dt.dt.dayofweek
    df['distance_km'] = _haversine_km(
        df['lat'], df['lon'], df['merchant_lat'], df['merchant_lon']
    )
    return df

def make_Xy(df):
    """Split a raw transactions frame into model inputs and labels.

    The frame is first passed through ``add_features``. Returns ``(X, y)``
    where ``X`` holds every remaining column except the excluded raw ones
    and ``y`` is the ``target`` values array, or ``None`` when the frame has
    no ``target`` column.
    """
    featured = add_features(df)

    y = None
    if 'target' in featured.columns:
        y = featured['target'].values

    # Raw columns that are not passed on as model inputs.
    excluded = {
        'transaction_time', 'name_1', 'name_2', 'street', 'one_city',
        'jobs', 'merch', 'post_code', 'target',
    }
    feature_cols = [col for col in featured.columns if col not in excluded]
    return featured[feature_cols], y

# Column groups fed to the numeric and categorical preprocessing branches.
# The submission cell reuses these two lists.
num_features = [
    'amount', 'lat', 'lon', 'merchant_lat', 'merchant_lon',
    'population_city', 'hour', 'dow', 'distance_km',
]
cat_features = ['cat_id', 'us_state', 'gender']

# Feature frame and label vector for the training set.
X, y = make_Xy(train_df)

# Numeric branch: fill gaps with the median, then standardise.
num_imputer = SimpleImputer(strategy='median')
num_scaler = StandardScaler()
X_num_imputed = num_imputer.fit_transform(X[num_features])
X_num = num_scaler.fit_transform(X_num_imputed)

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen at fit time are ignored at transform time.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_cat_imputed = cat_imputer.fit_transform(X[cat_features])
X_cat = cat_encoder.fit_transform(X_cat_imputed)

# Numeric columns first, one-hot columns after them.
X_final = np.concatenate([X_num, X_cat], axis=1)

# class_weight='balanced' reweights the two classes during fitting.
clf = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_final, y)

# Persist the model and every fitted preprocessing step for the submission cell.
joblib.dump(clf, '../models/logreg_model.joblib')
joblib.dump(num_imputer, '../preprocessors/num_imputer.joblib')
joblib.dump(num_scaler, '../preprocessors/num_scaler.joblib')
joblib.dump(cat_imputer, '../preprocessors/cat_imputer.joblib')
joblib.dump(cat_encoder, '../preprocessors/cat_encoder.joblib')


['../preprocessors/cat_encoder.joblib']

## Make submission

In [53]:
def make_X(df):
    """Build the model input frame for data that may have no ``target`` column.

    Delegates to ``make_Xy`` so that training and inference always share one
    feature-engineering path and one list of dropped columns; the labels
    returned by ``make_Xy`` (``None`` when ``target`` is absent) are discarded.
    """
    X, _ = make_Xy(df)
    return X

In [54]:
# Build the model inputs from a fresh read of the test set.
test_df = pd.read_csv('data/test.csv')
X = make_X(test_df)

# Restore the fitted preprocessing steps and the trained classifier from disk.
num_imputer = joblib.load('../preprocessors/num_imputer.joblib')
num_scaler = joblib.load('../preprocessors/num_scaler.joblib')
cat_imputer = joblib.load('../preprocessors/cat_imputer.joblib')
cat_encoder = joblib.load('../preprocessors/cat_encoder.joblib')
clf = joblib.load('../models/logreg_model.joblib')

# Apply the already-fitted transforms (transform only, no refitting).
# num_features / cat_features are the lists defined in the training cell.
X_num_imputed = num_imputer.transform(X[num_features])
X_num = num_scaler.transform(X_num_imputed)
X_cat_imputed = cat_imputer.transform(X[cat_features])
X_cat = cat_encoder.transform(X_cat_imputed)
X_final = np.concatenate([X_num, X_cat], axis=1)

# Hard class labels from the classifier, cast to plain integers.
predicted = clf.predict(X_final)
labels = predicted.astype(int)

# One row per test record, keyed by the test frame's row index.
submission = pd.DataFrame({'index': test_df.index, 'prediction': labels})
submission.to_csv('submissions/sample_submission.csv', index=False)

The submission scored 0.067 on Kaggle, so I guess it is good enough.