In [25]:
import pandas as pd
from zipfile import ZipFile

def load_zip_to_pd(filename):
    #returns dataframe of zipped JSON file
    zip = ZipFile(filename)
    zip.extractall()

    return pd.read_json('data.{}'.format('json'))

In [78]:
df = load_zip_to_pd('data.zip')

In [79]:
df['acct_type'].value_counts()

premium             12373
fraudster_event       851
fraudster             437
spammer_limited       218
spammer_warn          144
tos_warn               91
spammer_noinvite       84
tos_lock               77
locked                 54
fraudster_att           5
spammer_web             2
spammer                 1
Name: acct_type, dtype: int64

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14337 entries, 0 to 14336
Data columns (total 44 columns):
acct_type             14337 non-null object
approx_payout_date    14337 non-null int64
body_length           14337 non-null int64
channels              14337 non-null int64
country               14256 non-null object
currency              14337 non-null object
delivery_method       14321 non-null float64
description           14337 non-null object
email_domain          14337 non-null object
event_created         14337 non-null int64
event_end             14337 non-null int64
event_published       14238 non-null float64
event_start           14337 non-null int64
fb_published          14337 non-null int64
gts                   14337 non-null float64
has_analytics         14337 non-null int64
has_header            8928 non-null float64
has_logo              14337 non-null int64
listed                14337 non-null object
name                  14337 non-null object
name_length      

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [144]:
from HTMLParser import HTMLParser
import re

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return re.sub('[^A-Za-z]+', ' ', s.get_data())

In [199]:
docs_train, docs_test, y_train, y_test = train_test_split(df['description'], df['Fraud'])
vect = TfidfVectorizer(stop_words='english', preprocessor=strip_tags, analyzer='word', max_df=.5)
X_train = vect.fit_transform(docs_train)
X_test = vect.transform(docs_test)
mnnb = MultinomialNB(alpha=.01)
mnnb.fit(X_train, y_train)
tn, fp, fn, tp = confusion_matrix(y_test, mnnb.predict(X_test)).ravel()
print "MN naive recall:", float(tp) / (tp + fn)
print "MN naive precision:", float(tp) / (tp + fp)
print "Accuracy on MN naive bayes:", mnnb.score(X_test, y_test)

MN naive recall: 0.265573770492
MN naive precision: 0.554794520548
Accuracy on MN naive bayes: 0.919386331939


In [252]:
# vect = TfidfVectorizer(stop_words='english', preprocessor=strip_tags, analyzer='word', max_df=.5)
# X = vect.transform(df['description'])
# y = df['Fraud']
# mnnb = MultinomialNB(alpha=.01)
# mnnb.fit(X, y)
# mnnb.score(X, y)


In [235]:
mnnb

TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2',
        preprocessor=<function strip_tags at 0x1b44b8e60>, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [196]:
# log_reg recall: 0.0849673202614
# Accuracy on test set: 0.92189679219

In [136]:
from sklearn.metrics import confusion_matrix

In [140]:
tn, fp, fn, tp = confusion_matrix(y_test, log_reg.predict(X_test)).ravel()

In [141]:
print tp, fp
print fn, tn

23 0
318 3244


In [142]:
confusion_matrix(y_test, log_reg.predict(X_test))

array([[3244,    0],
       [ 318,   23]])

In [82]:
df["Fraud"] = ((df['acct_type'] == 'fraudster') 
               | (df['acct_type'] == 'fraudster_event') 
               | (df['acct_type'] == 'fraudster_att')).astype(int)

In [83]:
print df[df['Fraud'] == 1]['user_age'].mean()

print df[df['Fraud'] == 0]['user_age'].mean()

87.1523588554
402.683072677


In [84]:
print df[df['Fraud'] == 1]['body_length'].mean()

print df[df['Fraud'] == 0]['body_length'].mean()

1508.8863109
3886.99455688


In [85]:
print df[df['Fraud'] == 1]['currency'].value_counts()

print df[df['Fraud'] == 0]['currency'].value_counts()

USD    858
GBP    343
CAD     42
EUR     37
AUD     12
MXN      1
Name: currency, dtype: int64
USD    8838
GBP    1879
CAD    1280
AUD     767
EUR     225
NZD      55
Name: currency, dtype: int64


In [162]:
import numpy as np

In [164]:
df_num = df.select_dtypes(include=[np.number])

df_num.fillna(value=0, inplace=True)

In [250]:
from sklearn.ensemble import RandomForestClassifier

X = df_num.drop(['Fraud'], axis=1)
y = df_num['Fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y)

rf_classifier = RandomForestClassifier().fit(X_train, y_train)


confusion_matrix(y_test, rf_classifier.predict(X_test))

tn, fp, fn, tp = confusion_matrix(y_test, rf_classifier.predict(X_test)).ravel()
print "RF Accuracy", rf_classifier.score(X_test, y_test)
print "RF Recall:", float(tp) / (tp + fn)
print "RF Precision:", float(tp) / (tp + fp)


Grad Accuracy 0.976290097629
Grad Recall: 0.813609467456
Grad Precision: 0.925925925926
RF Accuracy 0.97489539749
RF Recall: 0.810650887574
RF Precision: 0.913333333333


In [236]:
from sklearn.ensemble import VotingClassifier