In [287]:
import pickle
import re
from collections import Counter
from functools import lru_cache

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [13]:
# Relative path of the JSON data file that the cells below load.
file = 'files/data.json'

In [297]:
# Load the data through the shared `file` path instead of repeating the
# literal, so the two cannot drift apart.
df = pd.read_json(file)

In [9]:
# List the distinct acct_type labels present in the data.
df['acct_type'].unique()

array(['fraudster_event', 'premium', 'spammer_warn', 'fraudster',
       'spammer_limited', 'spammer_noinvite', 'locked', 'tos_lock',
       'tos_warn', 'fraudster_att', 'spammer_web', 'spammer'], dtype=object)

In [16]:
# Show the column names of the loaded frame.
df.columns

Index(['acct_type', 'approx_payout_date', 'body_length', 'channels', 'country',
       'currency', 'delivery_method', 'description', 'email_domain',
       'event_created', 'event_end', 'event_published', 'event_start',
       'fb_published', 'gts', 'has_analytics', 'has_header', 'has_logo',
       'listed', 'name', 'name_length', 'num_order', 'num_payouts',
       'object_id', 'org_desc', 'org_facebook', 'org_name', 'org_twitter',
       'payee_name', 'payout_type', 'previous_payouts', 'sale_duration',
       'sale_duration2', 'show_map', 'ticket_types', 'user_age',
       'user_created', 'user_type', 'venue_address', 'venue_country',
       'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state'],
      dtype='object')

In [14]:
# acct_type labels that are labeled as fraud when the "fraud" column is built.
fraudy_boiz = [
    'fraudster_event',
    'fraudster',
    'fraudster_att',
]

In [11]:
def create_response_label(json_file, fraudulent_types):
    '''
    Load event records from a json file and attach a binary "fraud" column.

    A row gets fraud = 1 when its acct_type is one of fraudulent_types,
    otherwise fraud = 0.

    :param json_file: json file of data (anything pd.read_json accepts)
    :param fraudulent_types: list of strings naming the acct_types that count as fraud
    :return: DataFrame of the file's contents with the "fraud" column added
    '''
    events = pd.read_json(json_file)
    is_fraud = events['acct_type'].isin(fraudulent_types)
    return events.assign(fraud=is_fraud.astype(int))

In [45]:
# Rebuild df with the binary "fraud" column; create_response_label reads the
# JSON file itself, so this replaces the frame loaded earlier.
df = create_response_label(file, fraudy_boiz)

In [46]:
# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,...,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state,fraud
0,fraudster_event,1266062400,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,...,36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL,1
1,premium,1296720000,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,...,149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC,0
2,premium,1296172800,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,1291090956,...,214,1272559388,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA,0
3,premium,1388966400,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,1360681570,...,889,1283870102,3,,,,,,,0
4,premium,1297900800,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,1291994666,...,35,1288984065,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA,0
5,premium,1300496400,117,6,US,USD,0.0,<p>Come join the Bluegrass Stallions as they t...,bluegrassstallions.com,1294421810,...,299,1268579110,1,,US,38.209797,-84.558831,Georgetown College Alumni Gymnasium,KY,0
6,premium,1297909800,28,5,US,USD,1.0,<p> </p>\r\n<p> </p>\r\n<p> </p>,naboe.org,1294425018,...,706,1233437951,3,6N East St,US,39.414270,-77.405089,Danielle's Restaurant,MD,0
7,premium,1296709200,974,12,US,USD,0.0,<p><em>WHEN JEKYLL MET HYDE</em></p>\r\n<p>The...,themagnetictheatre.org,1294427837,...,71,1288276103,3,372 Depot Street,US,35.580468,-82.563855,The Magnetic Field,NC,0
8,premium,1298185200,4388,4,US,USD,0.0,"<p style=""text-align: center;""><font face=""boo...",credosf.com,1294428122,...,0,1294428121,3,465 California Street,US,37.792847,-122.402082,,,0
9,premium,1297314000,974,12,US,USD,0.0,<p><em>WHEN JEKYLL MET HYDE</em></p>\r\n<p>The...,themagnetictheatre.org,1294428286,...,71,1288276103,3,372 Depot Street,US,35.580468,-82.563855,The Magnetic Field,NC,0


In [35]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    '''
    HTML parser that throws away markup and collects only the text content.
    '''
    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so a second explicit
        # reset() is unnecessary. convert_charrefs=True is the default and is
        # spelled out because strip_tags relies on it decoding entities.
        super().__init__(convert_charrefs=True)
        self.fed = []

    def handle_data(self, d):
        '''Store one chunk of text found outside of tags.'''
        self.fed.append(d)

    def get_data(self):
        '''Return every collected text chunk joined into a single string.'''
        return ''.join(self.fed)

def strip_tags(html):
    '''
    Remove HTML markup from a string and keep only letters.

    Tags are dropped, then every run of non-letter characters (digits,
    punctuation, whitespace) is collapsed into a single space.

    :param html: string of HTML (a single document, not a Series)
    :return: string containing only A-Z / a-z words separated by single spaces
    '''
    s = MLStripper()
    s.feed(html)
    # feed() holds back trailing text that ends in an unterminated '&'
    # (e.g. "... R&B") in case more input follows; close() flushes it.
    # Without this call such text was silently dropped.
    s.close()
    return re.sub('[^A-Za-z]+', ' ', s.get_data())

In [37]:
# strip_tags works on one HTML string, so try it on the first description
# rather than passing the whole column.
df_desc_test = strip_tags(df['description'].iloc[0])

In [38]:
# Display the result returned by strip_tags.
df_desc_test

' Party Starz Entertaintment Diverse International Group Presents The Official Hour No Sleep Super Bowl Weekend Grand Finale No matter who wins or loses this post game party cannot be missed Enjoy the drink specials and all night music The ultimate No Sleep Super Bowl Weekend Grand Finale Sunday February th Ink Nightclub South Beach Washington Ave Miami Bch FL Hosted by Jamz with special appearances by Fast Life Youngstaz Special invited guests Billy Blue Dorrough and Brisco and with performances by various artist s and many many more at South Beach Miami s finest venue This is a party you have to see to believe This is everyone s last chance to make that impression so don t half step bring your A game and party with all of the hottest artists celebrities models and superstar athletes that are guaranteed to be in the building DJ Q DJ Slym will ber keepig the party jumping til the sun comes up with the hottest Hip Hop R B Reggae Old School and more Doors open pm Ladies are and for Gents

In [74]:
# Feature is the event description column; target is the binary fraud label.
X, y = df['description'], df['fraud']

In [None]:
# def create_train_test():
#     docs = pd.read_table('data/train.txt', header=None)
#     docs = docs.rename(index=str, columns={0: "Text"})
#     labels = pd.read_table('data/labels.txt', header=None)
#     labels = labels.rename(index=str, columns={0: "Label"})
#     X_train, X_test, y_train, y_test = train_test_split(docs, labels, test_size=.2)
#     return X_train, X_test, y_train, y_test

@lru_cache(maxsize=1)
def create_stemmer():
    '''
    Build the stemmer and tokenizing analyzer used by stem_words.

    The pair is built once and cached, because stem_words is called once per
    document. Previously the two objects were created as locals and
    discarded, leaving stem_words without a stemmer or analyzer.

    NOTE(review): SnowballStemmer is not imported anywhere in the visible
    notebook; presumably it is nltk's (nltk.stem.snowball) - confirm and add
    the import to the import cell.

    :return: tuple (stemmer, analyzer)
    '''
    stemmer = SnowballStemmer(language='english')
    analyzer = TfidfVectorizer(stop_words='english').build_analyzer()
    return stemmer, analyzer

def stem_words(doc):
    '''
    Tokenize a document with the TF-IDF analyzer and stem every token.

    :param doc: a single document string
    :return: generator of stemmed tokens
    '''
    stemmer, analyzer = create_stemmer()
    return (stemmer.stem(w) for w in analyzer(doc))

def pipeline(X_train, y_train):
    '''
    Fit a TF-IDF + Multinomial Naive Bayes text classifier.

    :param X_train: training data, indexed as X_train['Text'] for the documents
    :param y_train: training labels, indexed as y_train['Label']
    :return: the fitted Pipeline
    '''
    text_clf = Pipeline([
        # Pass the function itself (not its result): the vectorizer calls the
        # analyzer once per document. `stem_words()` raised a TypeError.
        ('tfidf', TfidfVectorizer(analyzer=stem_words)),
        ('clf', MultinomialNB(alpha=.5)),
    ])
    text_clf = text_clf.fit(X_train['Text'], y_train['Label'])
    return text_clf

def get_score(text_clf, X_test, y_test):
    '''
    Compute the accuracy of a fitted classifier on held-out data.

    :param text_clf: fitted classifier exposing predict()
    :param X_test: test data, indexed as X_test['Text'] for the documents
    :param y_test: test labels, indexed as y_test['Label']
    :return: fraction of predictions equal to the true label
    '''
    predicted = text_clf.predict(X_test['Text'])
    # Compare by position so that a non-default index on the labels cannot
    # affect the element-wise comparison.
    score = np.mean(np.asarray(predicted) == np.asarray(y_test['Label']))
    return score

In [80]:
# Load files/example.json into its own frame for inspection.
example_df = pd.read_json('files/example.json')

In [81]:
# Display the contents of the example file.
example_df

Unnamed: 0,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,1364360400,1474,11,US,USD,0,"<p><span style=""font-size: medium; font-family...",dreamprojectfoundation.org,1361291193,1363928400,...,"{'event_id': 5558108, 'cost': 50.0, 'availabil...",0,1361290985,1,1 Ionia Avenue Southwest,US,42.963058,-85.670615,Grand Rapids Brewing Co,MI


In [86]:
# Draw 20 rows to use as script test examples. The seed is fixed so that a
# fresh run reproduces the same rows (the sample is written to disk below).
test_script_examples_df = df.sample(20, random_state=42)

In [87]:
# Display the sampled rows.
test_script_examples_df

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,...,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state,fraud
1928,premium,1375416000,830,6,GB,GBP,0.0,"<p><span style=""font-size: x-large;"">Join us!<...",yahoo.com,1370557376,...,847,1297367430,3,Great Hampton Row,GB,52.490695,-1.905984,St Georges Community Hub,West Mids,0
6019,premium,1299060000,1184,5,US,USD,0.0,"<p style=""text-align: center;""><strong><font s...",hotmail.com,1297803219,...,46,1293837352,3,301 college main,US,30.619132,-96.346734,TheDrink,,0
4175,premium,1341399600,13514,0,US,USD,0.0,"<p style=""text-align: center;"" align=""center"">...",southsideent.com,1338415569,...,106,1329272713,1,251 W 30th St,US,40.749482,-73.994247,Rebel NYC,NY,0
8712,premium,1343332800,5164,5,US,USD,0.0,"<div style=""vertical-align: top; outline-width...",gmail.com,1340856492,...,203,1323287590,3,20425 N. 7th St,US,33.672211,-112.064538,Trillium Apartments Theatre Room,AZ,0
11757,premium,1352934300,2956,6,CA,CAD,1.0,"<p style=""text-align: center;""><span style=""fo...",euro-sports.ca,1350702936,...,497,1307740603,3,13 Bullman St,CA,45.403773,-75.731673,Euro-Sports Bicycle Shop,ON,0
12589,premium,1325980800,7255,6,US,USD,1.0,"<p> </p>\r\n<ul>\r\n<li style=""list-style-posi...",chicagobeerexperience.com,1324673405,...,296,1299103290,3,,,,,,,0
14090,premium,1330390800,1493,11,US,USD,3.0,"<p style=""font-weight: bold;"">Conference Atten...",faro.com,1322767432,...,0,1322767430,3,6000 W. Osceola Pkwy,US,28.342461,-81.526494,Gaylord Palms Resort,FL,0
11954,fraudster_event,1374370200,86,0,DZ,GBP,0.0,"<p>OPEP Corporate meeting in Istanbul, hosted ...",monkeyadvert.com,1373662289,...,0,1373662288,1,34 Taksim Yağhanesi Sokak,TR,41.037117,28.980936,Şht. Muhtar Mh.,Istanbul,1
10106,premium,1328074200,2787,11,GB,GBP,0.0,"<div style=""color: #000000; font-family: Verda...",msn.com,1326293470,...,29,1323791506,3,356 Holloway Rd,GB,51.555845,-0.116227,,Gt Lon,0
8941,premium,1377819000,7675,8,US,NZD,0.0,"<p style=""text-align: center;""><span style=""fo...",mindfulnessworks.co.nz,1371526982,...,14,1370316197,1,31 Hobson Crescent,NZ,-41.273331,174.780195,31 Hobson Crescent,Wellington,0


In [88]:
# Print the kernel's working directory (the relative 'files/...' paths are
# resolved against it). The output is a machine-specific absolute path.
pwd

'/Users/Katie/Desktop/Galvanize/Full_Course/Daily_Materials/Week_9/Day_3/dsi-fraud_detection_case_study'

In [169]:
# Save the sampled rows as JSON keyed by index label (orient='index'); readers
# of this file must use the same orient.
test_script_examples_df.to_json('files/test_script_examples.json', orient='index')

In [291]:
def make_prediction(file, model_pickle):
    '''
    Score the first record of a json file with a pickled model.

    :param file: json file written with orient='index'
    :param model_pickle: path to a pickled model exposing predict_proba()
    :return: one-row DataFrame (the first record) with a "predict_proba" column added
    '''
    df = pd.read_json(file, orient='index')
    # Copy the slice so adding a column does not write into a view of df.
    first_row = df.iloc[0:1].copy()
    # Unlabeled input has no "fraud" column; dropping it must not fail then.
    # NOTE(review): "acct_type" (the source of the label) is still passed to
    # the model if present - confirm which features the model expects.
    X = first_row.drop(columns='fraud', errors='ignore')
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model files from a trusted source. The model's class must also be
    # importable in this process for unpickling to succeed.
    with open(model_pickle, 'rb') as f:
        model = pickle.load(f)
    proba = np.asarray(model.predict_proba(X))
    if proba.ndim == 2:
        # A 2-D result cannot be assigned to a single column. Keep the last
        # column - presumably the positive (fraud) class; verify against the
        # model's class order.
        proba = proba[:, -1]
    first_row['predict_proba'] = proba
    return first_row

In [296]:
# Score the first saved example row with the pickled model.
make_prediction('files/test_script_examples.json', 'files/model.pkl')

ModuleNotFoundError: No module named 'cPickle'

In [295]:
 # Load the pickled model directly. Unpickling looks up the model's class by
 # name, so that class must be defined or importable in this kernel first.
 with open('files/model.pkl', 'rb') as f:
        model = pickle.load(f)

AttributeError: Can't get attribute 'Classifier' on <module '__main__'>