## Data

In [28]:
import pandas as pd

In [29]:
# Load the train and test splits; both paths are relative to the current working directory.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [30]:
# Show the first training row as a dict so every column appears with a sample value.
train_df.iloc[0].to_dict()

{'transaction_time': '2019-12-27 15:21',
 'merch': 'fraud_Cormier LLC',
 'cat_id': 'health_fitness',
 'amount': 148.04,
 'name_1': 'Daniel',
 'name_2': 'Martinez',
 'gender': 'M',
 'street': '8510 Acevedo Burgs',
 'one_city': 'Kent',
 'us_state': 'OR',
 'post_code': 97033,
 'lat': 45.0838,
 'lon': -120.6649,
 'population_city': 60,
 'jobs': 'Museum education officer',
 'merchant_lat': 45.042827,
 'merchant_lon': -120.709327,
 'target': 0}

In [31]:
# Column dtypes as inferred by read_csv (no explicit casts have been applied yet).
train_df.dtypes

transaction_time     object
merch                object
cat_id               object
amount              float64
name_1               object
name_2               object
gender               object
street               object
one_city             object
us_state             object
post_code             int64
lat                 float64
lon                 float64
population_city       int64
jobs                 object
merchant_lat        float64
merchant_lon        float64
target                int64
dtype: object

In [32]:
# Class balance of the label column.
train_df.target.value_counts()

target
0    781927
1      4504
Name: count, dtype: int64

## Pre-processing + logreg

In [33]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

def _haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Works element-wise on scalars, NumPy arrays, or pandas Series.

    Args:
        lat1, lon1: Latitude/longitude of the first point, in degrees.
        lat2, lon2: Latitude/longitude of the second point, in degrees.
        radius_km: Sphere radius in kilometres (default: mean Earth radius).

    Returns:
        Distance(s) in kilometres, same shape as the broadcast inputs.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * radius_km * np.arcsin(np.sqrt(a))


def add_features(df):
    """Return a copy of `df` with time-of-day and distance features added.

    The input frame is not modified. Three columns are added to the copy:

    * ``hour``        - hour of day taken from ``transaction_time``.
    * ``dow``         - day of week (Monday=0) taken from ``transaction_time``.
    * ``distance_km`` - haversine distance between (``lat``, ``lon``) and
      (``merchant_lat``, ``merchant_lon``).

    Args:
        df: DataFrame containing ``transaction_time``, ``lat``, ``lon``,
            ``merchant_lat`` and ``merchant_lon`` columns.

    Returns:
        A new DataFrame with the original columns plus the three features.
    """
    df = df.copy()
    dt = pd.to_datetime(df['transaction_time'])
    df['hour'] = dt.dt.hour
    df['dow'] = dt.dt.dayofweek
    df['distance_km'] = _haversine_km(
        df['lat'], df['lon'], df['merchant_lat'], df['merchant_lon']
    )
    return df

def make_Xy(df):
    """Split a raw frame into model inputs and (optional) labels.

    The frame is first passed through `add_features`. Identifier-like and raw
    columns listed below, along with the label itself, are excluded from the
    returned feature frame.

    Args:
        df: Raw DataFrame; may or may not contain a ``target`` column.

    Returns:
        (X, y) where X is the feature DataFrame and y is the ``target`` values
        as an array, or None when the frame has no ``target`` column.
    """
    featured = add_features(df)

    if 'target' in featured.columns:
        labels = featured['target'].values
    else:
        labels = None

    # Columns that must not reach the model.
    excluded = {
        'transaction_time', 'name_1', 'name_2', 'street', 'one_city',
        'jobs', 'merch', 'post_code', 'target',
    }
    kept_columns = [col for col in featured.columns if col not in excluded]
    return featured[kept_columns], labels

def build_logreg_pipeline():
    """Assemble an unfitted preprocessing + logistic-regression pipeline.

    Numeric columns are median-imputed and standardised. Categorical columns
    are imputed with their most frequent value and one-hot encoded with
    ``handle_unknown='ignore'``. The classifier is a LogisticRegression with
    ``class_weight='balanced'`` and ``max_iter=1000``.

    Returns:
        A Pipeline with two steps: 'prep' (the ColumnTransformer) and 'clf'
        (the classifier).
    """
    numeric_cols = [
        'amount', 'lat', 'lon', 'merchant_lat', 'merchant_lon',
        'population_city', 'hour', 'dow', 'distance_km',
    ]
    categorical_cols = ['cat_id', 'us_state', 'gender']

    numeric_steps = [
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]
    categorical_steps = [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessor = ColumnTransformer([
        ('num', Pipeline(numeric_steps), numeric_cols),
        ('cat', Pipeline(categorical_steps), categorical_cols),
    ])
    classifier = LogisticRegression(max_iter=1000, class_weight='balanced')

    return Pipeline([('prep', preprocessor), ('clf', classifier)])



In [34]:
# Build features/labels from the full training frame and fit the pipeline on all of it.
# NOTE(review): no hold-out split or cross-validation is visible in the cells shown,
# so model quality is not estimated locally before submitting.
X, y = make_Xy(train_df)
pipe = build_logreg_pipeline()
pipe.fit(X, y)

## Make submission

In [35]:
# Load the sample submission to inspect the expected columns and row count.
ss = pd.read_csv('submissions/sample_submition.csv')
ss

Unnamed: 0,index,prediction
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
262139,262139,0
262140,262140,0
262141,262141,0
262142,262142,0


In [36]:
# Distribution of the placeholder predictions in the sample file.
ss.prediction.value_counts()

prediction
0    262144
Name: count, dtype: int64

In [37]:
# Featurise the test frame exactly as in training; the label slot of the return value is discarded.
new_X, _ = make_Xy(test_df)
# Hard class labels from the pipeline's default decision rule.
# NOTE(review): the value counts printed after this cell show 32367 of 262144 test rows
# predicted positive (~12%), versus 4504 of 786431 (~0.6%) positives in training. This looks
# like the default cut-off combined with class_weight='balanced' over-flagging; consider
# thresholding predict_proba on a validation split — TODO confirm against the competition metric.
labels = pipe.predict(new_X).astype(int)

# 'index' comes from test_df's row index; 'prediction' is the predicted class label.
submission = pd.DataFrame({
    'index': test_df.index,
    'prediction': labels
})

# index=False so only the two columns above are written to the CSV.
submission.to_csv('submissions/submission.csv', index=False)

In [38]:
# Predicted class counts for the test set, as a sanity check on the positive rate.
submission.prediction.value_counts()

prediction
0    229777
1     32367
Name: count, dtype: int64

Submission scored 0.067 on Kaggle, so I guess it is good enough.