# LOGISTIC REGRESSION WITH L2 REGULARIZATION

# `Algorithm`
## `Goal: maximize the log likelihood function`

### `Log likelihood, simplified form:`
### $$\ell\ell(\mathbf{w}) = \sum_{i=1}^N \Big( (y^{(i)} - 1)\mathbf{w}^T \mathbf{x}^{(i)} - \ln\left(1 + \exp(-\mathbf{w}^T \mathbf{x}^{(i)})\right) \Big)\color{red}{-\lambda\|\mathbf{w}\|_2^2} $$
$$P(\mathbf{x}^{(i)}, \mathbf{w}) = \frac{\mbox 1}{\mbox 1 + e^{-\mathbf{w}^T\mathbf{x}^{(i)}}}$$
<br></br>
### `Gradient Ascent step:`
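### $$ \mathbf{w}^{new} = \mathbf{w}^{old} + \eta \Big( \mathbf{X}^T(\mathbf{y} - P(\mathbf{X},\mathbf{w})) \color{red}{-2\lambda \mathbf{w}} \Big) $$
### where $\eta$ is the step size and $\lambda$ is the L2 penalty (the intercept weight is not penalized).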

In [30]:
class LogisticRegression:
    """Binary logistic regression with an L2 penalty, trained by gradient ascent.

    The objective that is maximized is the L2-regularized log likelihood

        sum_i ((y_i - 1) * w.x_i - log(1 + exp(-w.x_i))) - l2 * ||w[1:]||^2

    The first coefficient is treated as the intercept and is never penalized.

    Parameters
    ----------
    l2 : float
        Strength of the L2 penalty.
    step_size : float
        Multiplier applied to the gradient on every update.
    n_rounds : int
        Number of gradient-ascent updates performed by ``fit``.
    """

    def __init__(self, l2=0.01, step_size=1e-6, n_rounds=150):
        self.l2 = l2
        self.step_size = step_size
        self.n_rounds = n_rounds

    def cost_function_with_l2(self, X, y, l2_penalty, coefficients):
        """Return the L2-regularized log likelihood of ``coefficients``.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            Feature matrix; column 0 is taken to be the intercept column.
        y : array of shape (n_samples,)
            Labels used as ``(y - 1)`` in the likelihood.
        l2_penalty : float
            Weight of the squared-norm penalty.
        coefficients : array of shape (n_features,)
            Coefficients to evaluate.

        Returns
        -------
        float
            Log likelihood minus ``l2_penalty * sum(coefficients[1:] ** 2)``.
        """
        y = np.asarray(y)
        coefficients = np.asarray(coefficients)
        scores = X @ coefficients

        # log(1 + exp(-s)) evaluated as logaddexp(0, -s): exp() alone
        # overflows to inf for strongly negative scores, whereas logaddexp
        # returns the finite value (approximately -s) directly.
        logexp = np.logaddexp(0., -scores)

        log_likelihood = np.sum((y - 1) * scores - logexp)
        # The intercept (index 0) is excluded from the penalty.
        return log_likelihood - l2_penalty * np.sum(coefficients[1:] ** 2)

    def predict(self, X):
        """Return the raw linear scores ``X @ weights``.

        No sigmoid or threshold is applied. ``fit`` must have been called
        first, since it is what sets ``self.weights``.
        """
        return X @ self.weights

    def fit(self, X, y):
        """Learn the weights with ``n_rounds`` gradient-ascent updates.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
            Feature matrix; column 0 is taken to be the intercept column.
        y : array of shape (n_samples,)
            Labels compared against the sigmoid output.

        Returns
        -------
        tuple
            ``(weights, cost_history)`` where ``cost_history[k]`` is the
            regularized log likelihood after update ``k``. The weights are
            also stored on ``self.weights``.
        """
        # Convert once up front so the matrix products in the loop do not
        # have to cast integer data on every round.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        weights = np.zeros(X.shape[1])
        cost_history = []
        for _ in range(self.n_rounds):
            # Gradient of the penalty term; the intercept is not regularized.
            penalty = 2 * self.l2 * weights
            penalty[0] = 0

            errors = y - self.__sigmoid(X, weights)
            weights = weights + self.step_size * ((X.T @ errors) - penalty)

            cost_history.append(
                self.cost_function_with_l2(X, y, self.l2, weights)
            )

        self.weights = weights
        return (weights, cost_history)

    def __sigmoid(self, X, w):
        """Return ``1 / (1 + exp(-X @ w))`` without overflowing.

        Written as ``exp(-log(1 + exp(-z)))`` so that very negative scores
        give 0.0 instead of passing through an overflowing ``exp``.
        """
        return np.exp(-np.logaddexp(0., -(X @ w)))

## Predict product review sentiment

In [3]:
import pandas as pd
import numpy as np

### Load and explore data

In [5]:
# Load the Amazon baby-product review subset into a DataFrame.
products = pd.read_csv("./data/amazon_baby_subset.csv")

In [6]:
# Preview the first rows of the loaded reviews.
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [7]:
# Collapse the labels to binary: values equal to 1 stay as they are and
# every other value becomes 0, then report the size of each class.
products["sentiment"] = products["sentiment"].where(products["sentiment"] == 1, 0)
print('# of positive reviews =', int((products['sentiment'] == 1).sum()))
print('# of negative reviews =', int((products['sentiment'] == 0).sum()))

# of positive reviews = 26579
# of negative reviews = 26493


In [8]:
# Count the reviews whose text is missing.
products["review"].isnull().sum()

241

In [9]:
# Replace missing review text with an empty string so every row holds a str.
products["review"] = products["review"].fillna("")

### Apply text cleaning on the review data

In [10]:
import json
# Read the list of most frequent words and make sure every entry is a str.
with open('./data/important_words.json', 'r') as f:
    important_words = list(map(str, json.load(f)))

In [11]:
import string 
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to remove_punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without any character listed in ``string.punctuation``;
        all other characters, including whitespace, are kept unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Store a punctuation-free copy of every review in a new column.
products['review_clean'] = products['review'].apply(remove_punctuation)

### Calculate words frequency

In [12]:
# Tokenize every cleaned review once, then count each important word in the
# pre-split tokens; splitting inside the loop would repeat the same work for
# every word.
tokenized_reviews = products['review_clean'].apply(str.split)
for word in important_words:
    # One column per word, holding how many times it occurs in each review.
    products[word] = tokenized_reviews.apply(lambda tokens: tokens.count(word))

In [13]:
# Draw 80% of the rows for training; the fixed random_state makes the draw repeatable.
train = products.sample(frac=0.8, random_state=0) 
# The test set is every row whose index label was not drawn for training.
test = products.drop(train.index)

In [14]:
def get_numpy_data(df, features, label):
    """Build a feature matrix with a leading intercept column, plus labels.

    The input DataFrame is left untouched: the column of ones is added to
    the returned array only, not to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table containing the feature and label columns.
    features : sequence of str
        Names of the feature columns, in the order they should appear.
    label : str
        Name of the label column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feature_matrix, labels)``; column 0 of ``feature_matrix`` is all
        ones and the remaining columns follow ``features``.
    """
    feature_values = np.array(df[list(features)])
    # Use the features' dtype for the ones so the stacked matrix keeps it.
    intercept = np.ones((feature_values.shape[0], 1), dtype=feature_values.dtype)
    return (np.hstack([intercept, feature_values]), np.array(df[label]))

In [17]:
# Build the feature matrix (intercept column first, then one count column per
# important word) and the sentiment labels for each split.
X_train, y_train = get_numpy_data(train, important_words, 'sentiment') 
X_test, y_test = get_numpy_data(test, important_words, 'sentiment') 

## Explore effects of L2 regularization

In [22]:
# Baseline run with L2 = 0 (no regularization).
# fit returns the learned weights and the objective value after each round.
log_reg_l2_0 = LogisticRegression(l2=0, step_size=5e-6, n_rounds=300)
coefficients_0_penalty, cost_history_0_penalty  = log_reg_l2_0.fit(X_train, y_train)

In [25]:
# Show the objective value recorded after each round of the L2 = 0 run.
cost_history_0_penalty

[-29245.050518621658,
 -29068.355734868815,
 -28898.633641431283,
 -28735.22214805927,
 -28577.622420128424,
 -28425.439032714818,
 -28278.343956878092,
 -28136.054729950574,
 -27998.321079891037,
 -27864.9166134508,
 -27735.63355360325,
 -27610.279321653932,
 -27488.674240599634,
 -27370.649924577014,
 -27256.048092730663,
 -27144.71965027494,
 -27036.523942251515,
 -26931.328123025127,
 -26829.006606949817,
 -26729.440578958292,
 -26632.517551744397,
 -26538.130960921586,
 -26446.179792358078,
 -26356.56823759004,
 -26269.20537425588,
 -26184.00486914862,
 -26100.884701907653,
 -26019.76690765924,
 -25940.577337121293,
 -25863.24543284433,
 -25787.704020385332,
 -25713.889113315512,
 -25641.739731053916,
 -25571.197728598454,
 -25502.2076372987,
 -25434.716515880624,
 -25368.673810993612,
 -25304.031226605846,
 -25240.742601625167,
 -25178.763795169452,
 -25118.05257895434,
 -25058.568536305756,
 -25000.27296734186,
 -24943.128799902974,
 -24887.100505839302,
 -24832.154022295257,
 -

In [29]:
# Run with L2 = 4, using the same step size and number of rounds as the L2 = 0 run.
log_reg_l2_4 = LogisticRegression(l2=4, step_size=5e-6, n_rounds=300)
coefficients_4_penalty, cost_history_4_penalty  = log_reg_l2_4.fit(X_train, y_train)

  if __name__ == '__main__':


KeyboardInterrupt: 

In [None]:
# Show the objective history of the L2 = 4 run.
# NOTE: this name only exists once the L2 = 4 fit has run to completion;
# the recorded run of that fit ended in a KeyboardInterrupt.
cost_history_4_penalty