# Import et présentation

In [1]:
# TP de machine learning


import seaborn as sns
import pandas as pd
import warnings

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import metrics

# Apply seaborn's default plotting theme. The parentheses are required:
# a bare `sns.set` only references the function and changes nothing.
sns.set()

# Single encoder instance; it is re-fitted every time `fit_transform` is called
# on a new column, so it only remembers the mapping of the last column encoded.
labelencoder = preprocessing.LabelEncoder()

# Load the datasets
ipAddress = pd.read_csv(r"/content/IpAddress_to_Country.csv")
fraud = pd.read_csv(r"/content/Fraud_Data.csv")
# Working names used by the rest of the notebook (plain aliases, not copies)
ipAddress_df = ipAddress
fraud_df = fraud

In [3]:
# Inspect the data; the target is the "class" column.
# Many of the columns are qualitative variables, which have to be converted
# to numbers if we want to keep them.
fraud_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,0


In [5]:
# Table of IP address ranges (lower bound and upper bound), each linked to a country
ipAddress_df

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,1.677722e+07,16777471,Australia
1,1.677747e+07,16777727,China
2,1.677773e+07,16778239,China
3,1.677824e+07,16779263,Australia
4,1.677926e+07,16781311,China
...,...,...,...
138841,3.758092e+09,3758093311,Hong Kong
138842,3.758093e+09,3758094335,India
138843,3.758095e+09,3758095871,China
138844,3.758096e+09,3758096127,Singapore



# DATA WRANGLING

In [6]:
# Remove every row that contains a missing ("NA") value from both datasets,
# so that incomplete records are left out of the analysis.
fraud_df, ipAddress_df = (frame.dropna() for frame in (fraud_df, ipAddress_df))

In [7]:
# Turn the qualitative columns into numeric ones.
#
# * "source", "browser" and "device_id" go through the `labelencoder` created
#   earlier: it collects the distinct values of a column and replaces each one
#   with an integer, identical values always receiving the same integer
#   (e.g. every "Paris" would become 1).
# * "signup_time" and "purchase_time" are date columns and are dropped; the
#   reasoning is that a fraud can happen at any time of day (scripts,
#   scheduled tasks).
#   NOTE(review): the delay between signup and purchase might still carry
#   signal - confirm before discarding it for good.
# * "sex" has two values and is replaced by its category codes (`cat.codes`).
fraud_df = (
    fraud_df
    .assign(**{
        name: labelencoder.fit_transform(fraud_df[name])
        for name in ('source', 'browser', 'device_id')
    })
    .drop(columns=['signup_time', 'purchase_time'])
    .assign(sex=lambda frame: frame['sex'].astype('category').cat.codes)
)

# Display both frames
fraud_df, ipAddress_df

(        user_id  purchase_value  device_id  ...  age    ip_address  class
 0         22058              34      89215  ...   39  7.327584e+08      0
 1        333320              16      24078  ...   53  3.503114e+08      0
 2          1359              15     131216  ...   53  2.621474e+09      1
 3        150084              44       3977  ...   41  3.840542e+09      0
 4        221365              39      68757  ...   45  4.155831e+08      0
 ...         ...             ...        ...  ...  ...           ...    ...
 151107   345170              43     125335  ...   28  3.451155e+09      1
 151108   274471              35      63001  ...   32  2.439047e+09      0
 151109   368416              40      64204  ...   26  2.748471e+09      0
 151110   207709              46      13118  ...   37  3.601175e+09      0
 151111   138208              20     134439  ...   38  4.103825e+09      0
 
 [151112 rows x 9 columns],
         lower_bound_ip_address  upper_bound_ip_address    country
 0 

# MERGING DES DEUX DATASETS

In [9]:
# Goal: link each transaction's IP address to an IP range, and through that
# range to a country.
#
# For every address we search for the range that contains it; as soon as a
# range matches, its country is the answer.

def IP_to_country(ip, ranges=None) :
  """Return the country whose IP range contains `ip`.

  Parameters
  ----------
  ip : number
      Numeric IP address to look up.
  ranges : pandas.DataFrame, optional
      Table providing the `lower_bound_ip_address`, `upper_bound_ip_address`
      and `country` columns. When omitted, the notebook-level `ipAddress_df`
      is used, so existing one-argument calls keep working.

  Returns
  -------
  The `country` value of the first row whose bounds (both inclusive) contain
  `ip`, or the string "Unknown" when no row matches.
  """
  table = ipAddress_df if ranges is None else ranges
  # Every call compares `ip` against both bound columns of the whole table.
  contains_ip = (table.lower_bound_ip_address <= ip) & (table.upper_bound_ip_address >= ip)
  candidates = table.country[contains_ip]
  if candidates.empty :
    return "Unknown"
  return candidates.iloc[0]

# Resolve the country of every transaction's IP address and store the result
# in a new "IP_country" column.
fraud_df["IP_country"] = fraud_df["ip_address"].map(IP_to_country)

# The fraud_df frame now carries the extra "IP_country" column.
fraud_df

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,ip_address,class,IP_country
0,22058,34,89215,2,0,1,39,7.327584e+08,0,Japan
1,333320,16,24078,0,0,0,53,3.503114e+08,0,United States
2,1359,15,131216,2,3,1,53,2.621474e+09,1,United States
3,150084,44,3977,2,4,1,41,3.840542e+09,0,Unknown
4,221365,39,68757,0,4,1,45,4.155831e+08,0,United States
...,...,...,...,...,...,...,...,...,...,...
151107,345170,43,125335,2,0,1,28,3.451155e+09,1,United States
151108,274471,35,63001,2,4,1,32,2.439047e+09,0,Netherlands
151109,368416,40,64204,2,2,0,26,2.748471e+09,0,Japan
151110,207709,46,13118,2,0,1,37,3.601175e+09,0,United States


# DATA CLEANING

In [10]:
# Some rows carry the value "Unknown" because their IP address did not fall
# inside any of the known ranges; those rows are removed.

fraud_df = fraud_df.loc[fraud_df["IP_country"].ne('Unknown')]
fraud_df

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,ip_address,class,IP_country
0,22058,34,89215,2,0,1,39,7.327584e+08,0,Japan
1,333320,16,24078,0,0,0,53,3.503114e+08,0,United States
2,1359,15,131216,2,3,1,53,2.621474e+09,1,United States
4,221365,39,68757,0,4,1,45,4.155831e+08,0,United States
5,159135,42,2322,0,0,1,18,2.809315e+09,0,Canada
...,...,...,...,...,...,...,...,...,...,...
151106,360761,13,22251,2,4,0,42,8.740657e+08,0,United States
151107,345170,43,125335,2,0,1,28,3.451155e+09,1,United States
151108,274471,35,63001,2,4,1,32,2.439047e+09,0,Netherlands
151109,368416,40,64204,2,2,0,26,2.748471e+09,0,Japan


In [11]:
# Extract the target column "class" on its own and display it

Y = fraud_df.loc[:, 'class']
Y

0         0
1         0
2         1
4         0
5         0
         ..
151106    0
151107    1
151108    0
151109    0
151110    0
Name: class, Length: 129146, dtype: int64

In [12]:
# Build the feature matrix: now that the country has been derived and the
# target isolated, the raw IP address and the target column are no longer needed.

X = fraud_df.drop(columns=['ip_address', 'class'])
X

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,IP_country
0,22058,34,89215,2,0,1,39,Japan
1,333320,16,24078,0,0,0,53,United States
2,1359,15,131216,2,3,1,53,United States
4,221365,39,68757,0,4,1,45,United States
5,159135,42,2322,0,0,1,18,Canada
...,...,...,...,...,...,...,...,...
151106,360761,13,22251,2,4,0,42,United States
151107,345170,43,125335,2,0,1,28,United States
151108,274471,35,63001,2,4,1,32,Netherlands
151109,368416,40,64204,2,2,0,26,Japan


In [13]:
# The new IP_country column holds strings; convert it to integers with the
# label encoder, exactly as was done for the other qualitative columns.

X = X.assign(IP_country=labelencoder.fit_transform(X['IP_country']))
X

Unnamed: 0,user_id,purchase_value,device_id,source,browser,sex,age,IP_country
0,22058,34,89215,2,0,1,39,84
1,333320,16,24078,0,0,0,53,171
2,1359,15,131216,2,3,1,53,171
4,221365,39,68757,0,4,1,45,171
5,159135,42,2322,0,0,1,18,32
...,...,...,...,...,...,...,...,...
151106,360761,13,22251,2,4,0,42,171
151107,345170,43,125335,2,0,1,28,171
151108,274471,35,63001,2,4,1,32,118
151109,368416,40,64204,2,2,0,26,84
