# Logistic Regression Using Jarvis

In [1]:
import seaborn as sns
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
sns.set_context("talk")
from sklearn.metrics import roc_curve
from sklearn.linear_model import LogisticRegressionCV
import re

import jarvis
jarvis.groundClient('git')
jarvis.jarvisFile('logistic_regression.ipynb')

## Collecting data

In [2]:
@jarvis.func
def crawl():
    train = pd.read_csv('train.csv')
    train['email'] = train['email'].str.lower()
    test = pd.read_csv('test.csv')
    test['email'] = test['email'].str.lower()
    stop_words = set(pd.read_csv('stopwords.csv')['a'])
    return train, test, stop_words

doCrawl = jarvis.Action(crawl)
train_set = jarvis.Artifact('train_set.pkl', doCrawl)
test_set = jarvis.Artifact('test_set.pkl', doCrawl)
stopwords = jarvis.Artifact('stop_words.pkl', doCrawl)

## Create feature matrix

In [3]:
def words_in_text(words, text):
    return pd.Series([1 if word in text else 0 for word in words])

def prop_punc(punc, text):
    return np.sum([1 if punc == text[i] else 0 for i in range(len(text))]) / len(text)

def proportion_capital(text):
    return len([letter for letter in text if letter.isupper()]) / len(text)

def count_words(arr, stop_words):
    dict = {}
    for email in arr:
        for word in str(email).split():
            if word not in stop_words:
                if word in dict:
                    dict[word] += 1
                else:
                    dict[word] = 1
    return dict

def ngrams(inputs, n):
    inputs = inputs.split(' ')
    output = {}
    for i in range(len(inputs)-n+1):
        g = ' '.join(inputs[i:i+n])
        output.setdefault(g, 0)
        output[g] += 1
    return output

def grams_in_text(gram_list, text):
    text_grams = ngrams(text, 2)
    return pd.Series([1 if gram in text_grams else 0 for gram in gram_list])

def create_row(table, index, most_distinguished_email, most_distinguished_ngrams, most_distinguished_subject):
    row = table.iloc[index, :]
    lst = []
#     lst.append(re.search(r'R[E|e]:', str(row['subject'])) == None)
#     lst.append(re.search(r'F[W|w][D|d]*:', str(row['subject'])) == None)
    lst.append(row['reply'])
    lst.append(row['forward'])
    lst.append(row['length of email'])
    lst.append(row['brackets subject'])
    for p in ['!', '?', '-', ':', '*', '#', '$']:
        lst.append(row['prop ' + p])
    words_email = most_distinguished_email
    lst += list(words_in_text(words_email, row['email']))
    subject_email = most_distinguished_subject[:2]
    lst += list(words_in_text(subject_email, np.array(row['subject'])))
    grams_email = most_distinguished_ngrams
    lst += list(grams_in_text(grams_email, row['email']))
    return np.array(lst)
    

@jarvis.func
def featurize(train, test, stop_words):
    train['reply'] = train['subject'].apply(lambda x: re.search(r'R[E|e]:', str(x))).apply(lambda x: 0 if x == None else 1)
    train['forward'] = train['subject'].apply(lambda x: re.search(r'F[W|w][D|d]*:', str(x))).apply(lambda x: 0 if x == None else 1)
    
    punc = ['!', '?', '.', '-', ':', '*', '#', '$', '%', '<', '>', '@']
    for p in punc:
        train['prop ' + p] = train['email'].apply(lambda e: prop_punc(p, e))
    
    train['capital proportion'] = train['subject'].apply(lambda x: proportion_capital(str(x)))
    train['brackets subject'] = train['subject'].apply(lambda x: re.search(r'\[.*\]', str(x))).apply(lambda x: 0 if x == None else 1)
    train['length of email'] = train['email'].apply(lambda x: len(x))
    train['log length'] = np.log(train['length of email'])
    spam_words = count_words(np.array(train[train['spam'] == 1]['email']), stop_words)
    ham_words = count_words(np.array(train[train['spam'] == 0]['email']), stop_words)
    
    shorter_spam = sorted(spam_words, key=spam_words.get, reverse=True)[:200]
    shorter_ham = sorted(ham_words, key=ham_words.get, reverse=True)[:200]

    prop_diff = {}
    for word in shorter_spam:
        if word in ham_words:
            prop_diff[word] = abs(spam_words[word] - ham_words[word])
        else:
            prop_diff[word] = spam_words[word]
    for word in shorter_ham:
        if word in spam_words:
            prop_diff[word] = abs(spam_words[word] - ham_words[word])
        else:
            prop_diff[word] = ham_words[word]

    mde = sorted(prop_diff, key=prop_diff.get, reverse = True)
    
    spam_subject = count_words(np.array(train[train['spam'] == 1]['subject']), stop_words)
    ham_subject = count_words(np.array(train[train['spam'] == 0]['subject']), stop_words)

    shorter_spam_subject = sorted(spam_subject, key=spam_subject.get, reverse=True)[:100]
    shorter_ham_subject = sorted(ham_subject, key=ham_subject.get, reverse=True)[:100]

    prop_diff_subject = {}
    for word in shorter_spam_subject:
        if word in ham_subject:
            prop_diff_subject[word] = abs(spam_subject[word] - ham_subject[word])
        else:
            prop_diff_subject[word] = spam_subject[word]
    for word in shorter_ham_subject:
        if word in spam_subject:
            prop_diff_subject[word] = abs(spam_subject[word] - ham_subject[word])
        else:
            prop_diff_subject[word] = ham_subject[word]

    mds = sorted(prop_diff_subject, key=prop_diff_subject.get, reverse = True)
    
    ham_ngrams = ngrams(train[train['spam'] == 0]['email'].str.cat(sep=" "), 2)
    spam_ngrams = ngrams(train[train['spam'] == 1]['email'].str.cat(sep=" "), 2)


    short_spam_ngrams = sorted(spam_ngrams, key=spam_ngrams.get, reverse=True)[:200]
    short_ham_ngrams = sorted(ham_ngrams, key=ham_ngrams.get, reverse=True)[:200]

    short_spam_ngrams

    prop_diff_ngrams = {}
    for word in short_spam_ngrams:
        if word in ham_ngrams:
            prop_diff_ngrams[word] = abs(spam_ngrams[word] - ham_ngrams[word])
        else:
            prop_diff_ngrams[word] = spam_ngrams[word]
    for word in short_ham_ngrams:
        if word in shorter_spam:
             prop_diff_ngrams[word] = abs(spam_ngrams[word] - ham_ngrams[word])
        else:
            prop_diff_ngrams[word] = ham_ngrams[word]

    mdn = sorted(prop_diff_ngrams, key=prop_diff_ngrams.get, reverse = True)[:200]
    
    X_train_new = np.array([create_row(train, i, mde, mdn, mds) for i in range(len(train))])
    y_train_new = train['spam']

    X_test = np.array([create_row(train, i, mde, mdn, mds) for i in range(len(test))])
    
    return X_train_new, y_train_new, X_test

doFeaturize = jarvis.Action(featurize, [train_set, test_set, stopwords])
x_tr = jarvis.Artifact('x_tr.pkl', doFeaturize)
y_tr = jarvis.Artifact('y_tr.pkl', doFeaturize)
x_te = jarvis.Artifact('x_te.pkl', doFeaturize)

    

    

## Train model

In [4]:
def find_optimal_cutoff(fpr, tpr, thresholds):
        dict = {}
        for i in range(len(thresholds)):
            dict[thresholds[i]] = tpr[i] - fpr[i]
        return sorted(dict, key = dict.get, reverse = True)[0]

@jarvis.func
def train_model(X_train_new, y_train_new):
    clf = LogisticRegressionCV(cv=5)
    clf.fit(X_train_new, y_train_new)
    y_predicted = clf.predict_proba(X_train_new)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_train_new, y_predicted)
    optimal_cutoff = find_optimal_cutoff(fpr, tpr, thresholds)
    tr_opt_cut = "Optimal_cutoff: {}".format(optimal_cutoff)
    return clf, tr_opt_cut

doTrainModel = jarvis.Action(train_model, [x_tr, y_tr])
model = jarvis.Artifact('model.pkl', doTrainModel)
opt_cutoff= jarvis.Artifact('optimal_cutoff.txt', doTrainModel)    

## Score model

In [5]:
@jarvis.func
def score_model(model, X_train_new, y_train_new):
    tr_acc = "Train Accuracy: {}".format(model.score(X_train_new, y_train_new))
    return tr_acc

doScoreModel = jarvis.Action(score_model, [model, x_tr, y_tr])
output = jarvis.Artifact('output.txt', doScoreModel)
output.peek(lambda x: print(''.join(x)))


Train Accuracy: 0.9900574988021082



In [6]:
output.pull()
output.plot()

[('2', '1'),
 ('1', '0'),
 ('6', '5'),
 ('5', '3'),
 ('5', '4'),
 ('11', '10'),
 ('10', '7'),
 ('10', '8'),
 ('10', '9'),
 ('16', '15'),
 ('15', '12'),
 ('15', '13'),
 ('15', '14'),
 ('12', '10'),
 ('13', '10'),
 ('14', '10'),
 ('7', '5'),
 ('8', '5'),
 ('3', '1'),
 ('7', '1'),
 ('8', '1')]