In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, classification_report

import h2o
from h2o.frame import H2OFrame
from h2o.estimators.random_forest import H2ORandomForestEstimator
%matplotlib inline


In [11]:
data = pd.read_csv('Fraud_Data.csv', parse_dates=['signup_time', 'purchase_time'])
data.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [12]:
address2country = pd.read_csv('IpAddress_to_Country.csv')
address2country.head()


Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [16]:
interval_index = pd.IntervalIndex.from_arrays(
    address2country['lower_bound_ip_address'],
    address2country['upper_bound_ip_address'],
    closed='both'
)


# Utiliser cut pour assigner les plages en fonction de l'intervalle, puis map pour obtenir le pays
data['country'] = pd.cut(data['ip_address'], bins=interval_index).map(dict(zip(interval_index, address2country['country'])))

data.head()
    

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


In [18]:
# some feature engineeringn 

# Get the time difference between purchase time and signup time
data['time_diff'] = (data['purchase_time'] - data['signup_time']).dt.total_seconds()



In [None]:
# Check user number for unique devices
device_num = data['device_id'].value_counts()
data['device_num'] = data['device_id'].map(device_num)


In [21]:
# Check user number for unique ip_address

ip_num = data['ip_address'].value_counts()
data['ip_num'] = data['ip_address'].map(ip_num)
data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,time_diff,device_num,ip_num
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0,1,1
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0,1,1
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,1.0,12,12
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,492085.0,1,1
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0,1,1


In [37]:
# Signup day and week
# Purchase day and week

data['signup_day'] = data['signup_time'].dt.dayofweek

data['purchase_day'] = data['purchase_time'].dt.dayofweek

data['signup_week'] = ((data['signup_time'].dt.day_of_year) // 7) 

data['purchase_week'] = ((data['purchase_time'].dt.day_of_year) // 7)



data.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,time_diff,device_num,ip_num,signup_day,purchase_day,signup_week,purchase_week
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0,1,1,1,5,7,15
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0,1,1,6,0,22,22
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,1.0,12,12,3,3,0,0
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,492085.0,1,1,1,0,16,17
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0,1,1,1,2,28,36


In [38]:
# Define features and target to be used
#signup_day	signup_week	purchase_day	purchase_week	purchase_value	source	browser	sex	age	country	time_diff	device_num	ip_num	class

features = ['signup_day', 'signup_week', 'purchase_day', 'purchase_week', 'purchase_value', 'source', 'browser' , "sex" , "age" , "country" , "time_diff" , "device_num" , "ip_num", "class"]

mydataframe = data[features]

mydataframe.head()

Unnamed: 0,signup_day,signup_week,purchase_day,purchase_week,purchase_value,source,browser,sex,age,country,time_diff,device_num,ip_num,class
0,1,7,5,15,34,SEO,Chrome,M,39,Japan,4506682.0,1,1,0
1,6,22,0,22,16,Ads,Chrome,F,53,United States,17944.0,1,1,0
2,3,0,3,0,15,SEO,Opera,M,53,United States,1.0,12,12,1
3,1,16,0,17,44,SEO,Safari,M,41,,492085.0,1,1,0
4,1,28,2,36,39,Ads,Safari,M,45,United States,4361461.0,1,1,0


In [47]:
# Split into 70% training and 30% test dataset
# Define features and target

from sklearn.model_selection import train_test_split
# class est la variable cible et derniere   colonne
X,y = mydataframe.iloc[:,:-1], mydataframe.iloc[:,-1] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Build random forest model  drf 
h2o.init()



Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "20.0.1" 2023-04-18; Java(TM) SE Runtime Environment (build 20.0.1+9-29); Java HotSpot(TM) 64-Bit Server VM (build 20.0.1+9-29, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/s_/531btfb97_q2k843yjsr9qm80000gn/T/tmpmf94l04q
  JVM stdout: /var/folders/s_/531btfb97_q2k843yjsr9qm80000gn/T/tmpmf94l04q/h2o_youcefbaleh_started_from_python.out
  JVM stderr: /var/folders/s_/531btfb97_q2k843yjsr9qm80000gn/T/tmpmf94l04q/h2o_youcefbaleh_started_from_python.err


H2OServerError: Server process terminated with error code 1: Exception in thread "main" java.util.ServiceConfigurationError: water.AbstractH2OExtension: Provider hex.tree.xgboost.XGBoostExtension could not be instantiated
	at java.base/java.util.ServiceLoader.fail(ServiceLoader.java:586)
	at java.base/java.util.ServiceLoader$ProviderImpl.newInstance(ServiceLoader.java:813)
	at java.base/java.util.ServiceLoader$ProviderImpl.get(ServiceLoader.java:729)
	at java.base/java.util.ServiceLoader$3.next(ServiceLoader.java:1403)
	at water.ExtensionManager.registerCoreExtensions(ExtensionManager.java:105)
	at water.H2O.main(H2O.java:2357)
	at water.H2OStarter.start(H2OStarter.java:22)
	at water.H2OStarter.start(H2OStarter.java:51)
	at water.H2OApp.main(H2OApp.java:12)
Caused by: java.util.ServiceConfigurationError: java.net.spi.InetAddressResolverProvider: Provider org.xbill.DNS.spi.DnsjavaInetAddressResolverProvider not found
	at java.base/java.util.ServiceLoader.fail(ServiceLoader.java:593)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.nextProviderClass(ServiceLoader.java:1219)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNextService(ServiceLoader.java:1228)
	at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNext(ServiceLoader.java:1273)
	at java.base/java.util.ServiceLoader$2.hasNext(ServiceLoader.java:1309)
	at java.base/java.util.ServiceLoader$3.hasNext(ServiceLoader.java:1393)
	at java.base/java.util.ServiceLoader.findFirst(ServiceLoader.java:1812)
	at java.base/java.net.InetAddress.loadResolver(InetAddress.java:480)
	at java.base/java.net.InetAddress.resolver(InetAddress.java:460)
	at java.base/java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1681)
	at java.base/java.net.InetAddress$NameServiceAddresses.get(InetAddress.java:1004)
	at java.base/java.net.InetAddress.getAllByName0(InetAddress.java:1673)
	at java.base/java.net.InetAddress.getLocalHost(InetAddress.java:1786)
	at org.apache.logging.log4j.core.util.NetUtils.getLocalHostname(NetUtils.java:54)
	at org.apache.logging.log4j.core.LoggerContext.lambda$setConfiguration$0(LoggerContext.java:620)
	at java.base/java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1708)
	at org.apache.logging.log4j.core.LoggerContext.setConfiguration(LoggerContext.java:620)
	at org.apache.logging.log4j.core.LoggerContext.reconfigure(LoggerContext.java:699)
	at org.apache.logging.log4j.core.LoggerContext.reconfigure(LoggerContext.java:716)
	at org.apache.logging.log4j.core.LoggerContext.start(LoggerContext.java:270)
	at org.apache.logging.log4j.core.impl.Log4jContextFactory.getContext(Log4jContextFactory.java:155)
	at org.apache.logging.log4j.core.impl.Log4jContextFactory.getContext(Log4jContextFactory.java:47)
	at org.apache.logging.log4j.LogManager.getContext(LogManager.java:309)
	at org.apache.log4j.Logger$PrivateManager.getContext(Logger.java:59)
	at org.apache.log4j.Logger.getLogger(Logger.java:41)
	at hex.tree.xgboost.XGBoostExtension.<clinit>(XGBoostExtension.java:24)
	at java.base/jdk.internal.misc.Unsafe.ensureClassInitialized0(Native Method)
	at java.base/jdk.internal.misc.Unsafe.ensureClassInitialized(Unsafe.java:1160)
	at java.base/jdk.internal.reflect.MethodHandleAccessorFactory.ensureClassInitialized(MethodHandleAccessorFactory.java:300)
	at java.base/jdk.internal.reflect.MethodHandleAccessorFactory.newConstructorAccessor(MethodHandleAccessorFactory.java:103)
	at java.base/jdk.internal.reflect.ReflectionFactory.newConstructorAccessor(ReflectionFactory.java:200)
	at java.base/java.lang.reflect.Constructor.acquireConstructorAccessor(Constructor.java:547)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:497)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:484)
	at java.base/java.util.ServiceLoader$ProviderImpl.newInstance(ServiceLoader.java:789)
	... 7 more
