In [354]:
import numpy as np

In [355]:
from sklearn.metrics import confusion_matrix

In [356]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [357]:
import pandas as pd
from zipfile import ZipFile

def load_zip_to_pd(filename):
    #returns dataframe of zipped JSON file
    zip = ZipFile(filename)
    zip.extractall()

    return pd.read_json('data.{}'.format('json'))

In [358]:
from HTMLParser import HTMLParser
import re

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return re.sub('[^A-Za-z]+', ' ', s.get_data())

In [359]:
#load data and add Fraud column
df = load_zip_to_pd('data.zip')

df["Fraud"] = ((df['acct_type'] == 'fraudster') 
               | (df['acct_type'] == 'fraudster_event') 
               | (df['acct_type'] == 'fraudster_att')).astype(int)

In [407]:
# MultinomialNB classifier using text from description
docs_train, docs_test, y_train, y_test = train_test_split(df['description'], df['Fraud'])
vect = TfidfVectorizer(stop_words='english', preprocessor=strip_tags, analyzer='word', max_df=.5)
X_train = vect.fit_transform(docs_train)
X_test = vect.transform(docs_test)
mnnb = MultinomialNB(alpha=.01)
mnnb.fit(X_train, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [361]:
tn, fp, fn, tp = confusion_matrix(y_test, mnnb.predict(X_test)).ravel()
print "MN naive recall:", float(tp) / (tp + fn)
print "MN naive precision:", float(tp) / (tp + fp)
print "Accuracy on MN naive bayes:", mnnb.score(X_test, y_test)

MN naive recall: 0.227963525836
MN naive precision: 0.609756097561
Accuracy on MN naive bayes: 0.915760111576


In [347]:
# train_probas = mnnb.predict_proba(X_train)
# test_probas = mnnb.predict_proba(X_test)

In [362]:
X = vect.fit_transform(df['description'])
y = df['Fraud']

mnnb = MultinomialNB(alpha=.01)
mnnb.fit(X, y)
probas = mnnb.predict_proba(X)[:, 1]

In [363]:
df['NB_probas'] = probas

In [379]:
[str(x) for x  in df_num.columns]

['approx_payout_date',
 'body_length',
 'channels',
 'delivery_method',
 'event_created',
 'event_end',
 'event_published',
 'event_start',
 'fb_published',
 'gts',
 'has_analytics',
 'has_header',
 'has_logo',
 'name_length',
 'num_order',
 'num_payouts',
 'object_id',
 'org_facebook',
 'org_twitter',
 'sale_duration',
 'sale_duration2',
 'show_map',
 'user_age',
 'user_created',
 'user_type',
 'venue_latitude',
 'venue_longitude',
 'Fraud',
 'NB_probas']

In [391]:
df_num = df.select_dtypes(include=[np.number])
df_num.fillna(value=0, inplace=True)

X = df_num.drop(['Fraud'], axis=1)
y = df_num['Fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)


rf_classifier = RandomForestClassifier(n_estimators=40, oob_score=True).fit(X_train, y_train)


confusion_matrix(y_test, rf_classifier.predict(X_test))

tn, fp, fn, tp = confusion_matrix(y_test, rf_classifier.predict(X_test)).ravel()
print "RF Accuracy", rf_classifier.score(X_test, y_test)
print "RF Recall:", float(tp) / (tp + fn)
print "RF Precision:", float(tp) / (tp + fp)
print "confusion matrix:"
print tp, fp
print fn, tn

RF Accuracy 0.990237099024
RF Recall: 0.936813186813
RF Precision: 0.966005665722
confusion matrix:
341 12
23 3209


In [406]:
X_test

Unnamed: 0,approx_payout_date,body_length,channels,delivery_method,event_created,event_end,event_published,event_start,fb_published,gts,...,org_twitter,sale_duration,sale_duration2,show_map,user_age,user_created,user_type,venue_latitude,venue_longitude,NB_probas
3726,1350081000,956,5,0.0,1346328863,1349649000,1.346751e+09,1349645400,0,297.28,...,0.0,33.0,38,1,9,1345548020,3,50.835052,-0.132263,1.940887e-04
7589,1327626000,28270,0,1.0,1322675033,1327194000,1.323636e+09,1327114800,0,3959.71,...,0.0,40.0,51,1,471,1281977392,4,44.085113,-123.042168,6.442459e-05
1143,1346351400,708,6,1.0,1333023839,1345919400,1.333025e+09,1345908600,1,3281.64,...,0.0,149.0,149,0,440,1295042176,4,41.450653,-85.281456,3.162852e-04
9644,1331274600,1939,0,1.0,1329914911,1330842600,1.329916e+09,1330828200,0,206.42,...,0.0,10.0,10,1,21,1328104625,3,38.859563,-77.069172,6.681611e-05
11750,1351139400,5695,8,0.0,1349368824,1350707400,1.349372e+09,1350696600,0,120.00,...,15.0,15.0,15,1,210,1331251273,3,34.054641,-118.255311,7.771610e-05
13420,1377810000,1334,11,1.0,1370550957,1377378000,1.370554e+09,1377367200,0,1653.63,...,16.0,79.0,79,1,0,1370550955,1,53.566452,-113.504498,6.383782e-04
13794,1359488700,870,11,0.0,1357680151,1359056700,1.357682e+09,1359046800,0,303.66,...,0.0,16.0,16,1,672,1299617210,1,39.746518,-104.998367,9.948115e-01
4714,1349492400,171,6,0.0,1347029839,1349060400,1.347031e+09,1348959600,0,355.81,...,0.0,22.0,22,1,136,1335280245,3,39.568484,-104.960454,9.018623e-02
1704,1348822800,11508,5,1.0,1346277503,1348390800,1.346279e+09,1348376400,0,2332.65,...,0.0,24.0,24,0,1146,1247254963,3,37.785551,-122.408323,7.606497e-05
4514,1295046000,1090,8,0.0,1293671757,1294614000,1.293730e+09,1294606800,0,45.00,...,0.0,10.0,11,0,678,1235103192,4,42.342379,-71.119574,8.376273e-05


In [380]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print scores.mean()

0.9893045744


In [196]:
# log_reg recall: 0.0849673202614
# Accuracy on test set: 0.92189679219

In [83]:
print df[df['Fraud'] == 1]['user_age'].mean()

print df[df['Fraud'] == 0]['user_age'].mean()

87.1523588554
402.683072677


In [84]:
print df[df['Fraud'] == 1]['body_length'].mean()

print df[df['Fraud'] == 0]['body_length'].mean()

1508.8863109
3886.99455688


In [85]:
print df[df['Fraud'] == 1]['currency'].value_counts()

print df[df['Fraud'] == 0]['currency'].value_counts()

USD    858
GBP    343
CAD     42
EUR     37
AUD     12
MXN      1
Name: currency, dtype: int64
USD    8838
GBP    1879
CAD    1280
AUD     767
EUR     225
NZD      55
Name: currency, dtype: int64


In [394]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14337 entries, 0 to 14336
Data columns (total 46 columns):
acct_type             14337 non-null object
approx_payout_date    14337 non-null int64
body_length           14337 non-null int64
channels              14337 non-null int64
country               14256 non-null object
currency              14337 non-null object
delivery_method       14321 non-null float64
description           14337 non-null object
email_domain          14337 non-null object
event_created         14337 non-null int64
event_end             14337 non-null int64
event_published       14238 non-null float64
event_start           14337 non-null int64
fb_published          14337 non-null int64
gts                   14337 non-null float64
has_analytics         14337 non-null int64
has_header            8928 non-null float64
has_logo              14337 non-null int64
listed                14337 non-null object
name                  14337 non-null object
name_length      