In [354]:
import numpy as np

In [355]:
from sklearn.metrics import confusion_matrix

In [356]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [357]:
import pandas as pd
from zipfile import ZipFile

def load_zip_to_pd(filename):
    #returns dataframe of zipped JSON file
    zip = ZipFile(filename)
    zip.extractall()

    return pd.read_json('data.{}'.format('json'))

In [358]:
from HTMLParser import HTMLParser
import re

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return re.sub('[^A-Za-z]+', ' ', s.get_data())

In [359]:
#load data and add Fraud column
df = load_zip_to_pd('data.zip')

df["Fraud"] = ((df['acct_type'] == 'fraudster') 
               | (df['acct_type'] == 'fraudster_event') 
               | (df['acct_type'] == 'fraudster_att')).astype(int)

In [360]:
# MultinomialNB classifier using text from description
docs_train, docs_test, y_train, y_test = train_test_split(df['description'], df['Fraud'])
vect = TfidfVectorizer(stop_words='english', preprocessor=strip_tags, analyzer='word', max_df=.5)
X_train = vect.fit_transform(docs_train)
X_test = vect.transform(docs_test)
mnnb = MultinomialNB(alpha=.01)
mnnb.fit(X_train, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [361]:
tn, fp, fn, tp = confusion_matrix(y_test, mnnb.predict(X_test)).ravel()
print "MN naive recall:", float(tp) / (tp + fn)
print "MN naive precision:", float(tp) / (tp + fp)
print "Accuracy on MN naive bayes:", mnnb.score(X_test, y_test)

MN naive recall: 0.227963525836
MN naive precision: 0.609756097561
Accuracy on MN naive bayes: 0.915760111576


In [347]:
# train_probas = mnnb.predict_proba(X_train)
# test_probas = mnnb.predict_proba(X_test)

In [362]:
X = vect.fit_transform(df['description'])
y = df['Fraud']

mnnb = MultinomialNB(alpha=.01)
mnnb.fit(X, y)
probas = mnnb.predict_proba(X)[:, 1]

In [363]:
df['NB_probas'] = probas

In [379]:
[str(x) for x  in df_num.columns]

['approx_payout_date',
 'body_length',
 'channels',
 'delivery_method',
 'event_created',
 'event_end',
 'event_published',
 'event_start',
 'fb_published',
 'gts',
 'has_analytics',
 'has_header',
 'has_logo',
 'name_length',
 'num_order',
 'num_payouts',
 'object_id',
 'org_facebook',
 'org_twitter',
 'sale_duration',
 'sale_duration2',
 'show_map',
 'user_age',
 'user_created',
 'user_type',
 'venue_latitude',
 'venue_longitude',
 'Fraud',
 'NB_probas']

In [368]:
df_num = df.select_dtypes(include=[np.number])
df_num.fillna(value=0, inplace=True)

X = df_num.drop(['Fraud'], axis=1)
y = df_num['Fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y)


rf_classifier = RandomForestClassifier().fit(X_train, y_train)


confusion_matrix(y_test, rf_classifier.predict(X_test))

tn, fp, fn, tp = confusion_matrix(y_test, rf_classifier.predict(X_test)).ravel()
print "RF Accuracy", rf_classifier.score(X_test, y_test)
print "RF Recall:", float(tp) / (tp + fn)
print "RF Precision:", float(tp) / (tp + fp)
print "confusion matrix:"
print tp, fp
print fn, tn

RF Accuracy 0.987726638773
RF Recall: 0.913621262458
RF Precision: 0.938566552901
confusion matrix:
275 18
26 3266


In [371]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print scores

[ 0.9869828   0.9893073   0.98976744  0.98930233  0.98697674]


In [196]:
# log_reg recall: 0.0849673202614
# Accuracy on test set: 0.92189679219

In [83]:
print df[df['Fraud'] == 1]['user_age'].mean()

print df[df['Fraud'] == 0]['user_age'].mean()

87.1523588554
402.683072677


In [84]:
print df[df['Fraud'] == 1]['body_length'].mean()

print df[df['Fraud'] == 0]['body_length'].mean()

1508.8863109
3886.99455688


In [85]:
print df[df['Fraud'] == 1]['currency'].value_counts()

print df[df['Fraud'] == 0]['currency'].value_counts()

USD    858
GBP    343
CAD     42
EUR     37
AUD     12
MXN      1
Name: currency, dtype: int64
USD    8838
GBP    1879
CAD    1280
AUD     767
EUR     225
NZD      55
Name: currency, dtype: int64
