# Post Here: Subreddit Predictor

## Recommendation API - 1.3

> Using Keras and Pre-Trained Embeddings

---
---

## Model #3 - Neural Networks and Pre-Trained Embeddings

The third iteration of the model for predicting the most appropriate subreddits for a given post will use Keras to architect a neural network to make use of a pre-trained word embedding.

- [reddit self-post classification task dataset](https://www.kaggle.com/mswarbrickjones/reddit-selfposts)


---

### Load _le_ data

In [1]:
# === General imports === #
import pandas as pd
import numpy as np
import os

In [7]:
# === sklearn imports === #
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# === Keras imports === #
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# === Load the dataset === #
# Tab-separated, 10k rows
df1 = pd.read_csv("dataset.csv", sep="\t")

# Confirm it is correct shape
assert df1.shape[0] == 10000
df1.shape

(10000, 4)

In [31]:
# === Arrange data into feature and target === #

# MVP model only uses 'selftext' feature
X = df1["selftext"]

# Predict the subreddit of each post
y = df1["subreddit"]

print(X.shape, y.shape)

(10000,) (10000,)


In [32]:
# === Encode the target using LabelEncoder === #

# This process naively transforms each class of the target into a number
# from sklearn.preprocessing import LabelEncoder

le = LabelEncoder() # Instantiate a new encoder instance
y = le.fit_transform(y)  # Fit and transform label data

y[:8]

array([920, 931, 161, 827, 669, 822,  39, 650])

In [33]:
# === Tokenize using keras === #
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# import numpy as np

# Parameters
maxlen = 100
max_words = 20000  # Use only top 20k words

# Tokenize
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Build word index and pad sequences
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(y)
print(f"Shape of data tensor: {data.shape}")
print(f"Shape of label tensor: {labels.shape}")

# Set up train / test
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

Found 56166 unique tokens.
Shape of data tensor: (10000, 100)
Shape of label tensor: (10000,)


---
---

### Model Training

In [34]:
# === LSTM Recurrent Neural Network using Keras === #
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM

# Instantiate the model
model = Sequential()

# Construct the network, layer by layer
model.add(Embedding(max_words, 128))  # Word embedding layer
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(le.classes_), activation="softmax"))

# Compile the network
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_6 (Dense)              (None, 1013)              130677    
Total params: 2,822,261
Trainable params: 2,822,261
Non-trainable params: 0
_________________________________________________________________


In [35]:
# === Train and test the network === #
model.fit(X_train, y_train, 
          batch_size=32,
          epochs=1,
          validation_data=(X_test, y_test))

Train on 8000 samples, validate on 2000 samples


<tensorflow.python.keras.callbacks.History at 0x150ac9a50>

In [None]:
# === Neural Network using Keras Layers === #
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding

# Instantiate the model
model = Sequential()

# Construct the network, layer by layer
model.add(Embedding(max_words, 128, input_length=maxlen))  # Word embedding layer
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))

# Compile the network
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["acc"])
model.summary()

In [27]:
# === Create predictions on test feature === #
y_pred_proba = nb.predict_proba(X_test_sparse)

print(y_pred_proba.shape)

(2000, 1013)


In [28]:
# === For each prediction, find the index with the highest probability === #
# import numpy as np
y_pred = np.argmax(y_pred_proba, axis=1)
y_pred[:10]

array([279, 279, 279, 279, 598, 560, 185, 553, 185, 185])

In [29]:
# === Evaluate performance using precision-at-k === #

def precision_at_k(y_true, y_pred, k=5):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_pred = np.argsort(y_pred, axis=1)
    y_pred = y_pred[:, ::-1][:, :k]
    arr = [y in s for y, s in zip(y_true, y_pred)]
    return np.mean(arr)

print('precision@1 =', np.mean(y_test == y_pred))
print('precision@3 =', precision_at_k(y_test, y_pred_proba, 3))
print('precision@5 =', precision_at_k(y_test, y_pred_proba, 5))

# 1,000,000 records
# precision@1 = 0.6732724580454097
# precision@3 = 0.8058588351431392
# precision@5 = 0.8481391905231984

# 100,000 records
# precision@1 = 0.45935
# precision@3 = 0.6075
# precision@5 = 0.665

# 10,000 records
# precision@1 = 0.108
# precision@3 = 0.1785
# precision@5 = 0.215

precision@1 = 0.108
precision@3 = 0.1785
precision@5 = 0.215


---

### Generate Predictions from new input

In [18]:
# === Example post === #

# The example comes from 'r/learnprogramming'
post = """I am a new grad looking for a job and currently in the process with a company for a junior backend engineer role. I was under the impression that the position was Javascript but instead it is actually Java. My general programming and "leet code" skills are pretty good, but my understanding of Java is pretty shallow. How can I use the next three days to best improve my general Java knowledge? Most resources on the web seem to be targeting complete beginners. Maybe a book I can skim through in the next few days?

Edit:

A lot of people are saying "the company is a sinking ship don't even go to the interview". I just want to add that the position was always for a "junior backend engineer". This company uses multiple languages and the recruiter just told me the incorrect language for the specific team I'm interviewing for. I'm sure they're mainly interested in seeing my understanding of good backend principles and software design, it's not a senior lead Java position."""

In [23]:
# === Function to serve predictions === #
# The main functionality of the predict API endpoint

def predict(post: str, n: int = 5) -> dict:
    """
    Serve subreddit predictions.
    
    Parameters
    ----------
    post : string
        Selftext that needs a home.
    n    : integer
        The desired name of the output file,
        not including the '.pkl' extension.

    Returns
    -------
    Python dictionary formatted as follows:
        [{'subreddit': 'PLC', 'proba': 0.014454},
         ...
         {'subreddit': 'Rowing', 'proba': 0.005206}]
    """
    
    # Vectorize the post -> sparse doc-term matrix
    post_vec = vocab.transform([post])
    
    # Generate predicted probabilities from trained model
    proba = nb.predict_proba(post_vec)
    
    # Wrangle into correct format
    return (pd
                .DataFrame(proba, columns=[le.classes_])  # Classes as column names
                .T  # Transpose so column names become index
                .reset_index()  # Pull out index into a column
                .rename(columns={"level_0": "subreddit", 0: "proba"})  # Rename for aesthetics
                .sort_values(by="proba", ascending=False)  # Sort by probability
                .iloc[:n]  # n-top predictions to serve
                .to_dict(orient="records")
               )

In [24]:
# === Test out the function === #
post_pred = predict(post)  # Default is 5 results
post_pred

[{'subreddit': 'PLC', 'proba': 0.014454003455867464},
 {'subreddit': 'sales', 'proba': 0.008377114822879663},
 {'subreddit': 'AskHR', 'proba': 0.007373675677628392},
 {'subreddit': 'OccupationalTherapy', 'proba': 0.00561548847793111},
 {'subreddit': 'Rowing', 'proba': 0.005206212277479405}]

In [30]:
# === Test it out with another dummy post === #

# This one comes from r/suggestmeabook
post2 = """I've been dreaming about writing my own stort story for a while but I want to give it an unexpected ending. I've read lots of books, but none of them had the plot twist I want. I want to read books with the best plot twists, so that I can analyze what makes a good plot twist and write my own story based on that points. I don't like romance novels and I mostly enjoy sci-fi or historical books but anything beside romance novels would work for me, it doesn't have to be my type of novel. I'm open to experience after all. I need your help guys. Thanks in advance."""

In [31]:
# === This time with 10 results === #
post2_pred = predict(post2, n=10)
post2_pred

[{'subreddit': 'PLC', 'proba': 0.008913827946281429},
 {'subreddit': 'JUSTNOMIL', 'proba': 0.005364412871980992},
 {'subreddit': 'vaginismus', 'proba': 0.005343515334435943},
 {'subreddit': 'Rowing', 'proba': 0.004869320257487137},
 {'subreddit': 'hookah', 'proba': 0.00434668954125044},
 {'subreddit': 'dresdenfiles', 'proba': 0.004295750109887567},
 {'subreddit': 'StudentLoans', 'proba': 0.004077573292318194},
 {'subreddit': 'productivity', 'proba': 0.004050498345338448},
 {'subreddit': 'flexibility', 'proba': 0.003925624264099203},
 {'subreddit': 'dpdr', 'proba': 0.003923252551916036}]

---

### Picklization

In [61]:
# === Create pickle func to make pickling (a little) easier === #

def picklizer(to_pickle, filename, path):
    """
    Creates a pickle file.
    
    Parameters
    ----------
    to_pickle : Python object
        The trained / fitted instance of the 
        transformer or model to be pickled.
    filename : string
        The desired name of the output file,
        not including the '.pkl' extension.
    path : string or path-like object
        The path to the desired output directory.
    """
    import os
    import pickle

    # Create the path to save location
    picklepath = os.path.join(path, filename)

    # Use context manager to open file
    with open(picklepath, "wb") as p:
        pickle.dump(to_pickle, p)

In [62]:
# === Picklize! === #
filepath = "../assets"

# Export pipeline as pickle
picklizer(vocab, "03_pipe.pkl", filepath)