# Setup

In [2]:
import pandas as pd
from google.colab import drive
import joblib
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# import tensorflow as tf
from tensorflow import keras
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Activation, Dropout
# from tensorflow.keras.callbacks import EarlyStopping

# Mount Google Drive so that files stored there are reachable under /content/drive.
drive.mount("/content/drive")
# Directory prefix (note the trailing slash) that load_objects() prepends to each file name.
data_path = "/content/drive/MyDrive/data/final_project/"


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Functions

In [6]:
def load_objects():
  """Load the saved model and the three joblib-pickled objects found under `data_path`.

  Returns:
    A 4-tuple ``(model, matrix_title, matrix_body, tfidf)``: the result of
    ``keras.models.load_model`` followed by the three ``joblib.load`` results,
    in that order.
  """
  net = keras.models.load_model(f"{data_path}model")

  # joblib.load unpickles its input, so these files must come from a trusted source.
  title_vec, body_vec, tfidf_tx = (
    joblib.load(pickle_path)
    for pickle_path in (
      f"{data_path}matrix_title.pkl",
      f"{data_path}matrix_body.pkl",
      f"{data_path}tfidf.pkl",
    )
  )

  return net, title_vec, body_vec, tfidf_tx


def process_text(text):
    """Clean a text string: replace punctuation with spaces, lower-case it, drop English stopwords.

    Args:
        text: The string to clean.

    Returns:
        The surviving whitespace-separated words joined by single spaces
        (an empty string when nothing survives).
    """
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences `\/`, `\[` and `\]`, which newer Python versions warn about.
    # The character set itself is unchanged (note that the backslash is NOT
    # stripped) — presumably it must match the preprocessing used when the
    # saved vectorizers were fitted, so do not alter it without refitting.
    text = re.sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]""", " ", text)  # Punctuation
    text = text.lower()

    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once for every word in the text.
    stop = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop)  # Stopwords


def predict_tag(title, body, tag=""):
  """Predict the tag for a single question, print it, and return it.

  Reads the module-level `matrix_title`, `matrix_body`, `tfidf` and `model`
  objects and cleans both texts with `process_text`.

  Args:
    title: Question title text.
    body: Question body text.
    tag: Optional actual tag; when it is not "", it is printed after the prediction.

  Returns:
    The label (from the fixed ten-label list below) with the largest model output.
  """
  question = pd.DataFrame({"title": title, "body": body}, index=[0])
  question['body_proc'] = question['body'].apply(process_text)
  question['title_proc'] = question['title'].apply(process_text)

  # Vectorize each cleaned text and give the resulting columns an "X_..." name.
  body_feats = pd.DataFrame(
    matrix_body.transform(question["body_proc"]).toarray()
  ).rename(columns=lambda i: f"X_body_{i}")
  title_feats = pd.DataFrame(
    matrix_title.transform(question["title_proc"]).toarray()
  ).rename(columns=lambda i: f"X_title_{i}")

  question = pd.concat([question, body_feats, title_feats], axis=1)

  # Only the generated feature columns (body first, then title) feed the transformer.
  feature_cols = [name for name in question.columns if "X_" in name]
  X_tfidf = tfidf.transform(question[feature_cols]).toarray()

  labels = ["c", "c#", "c++", "html", "java", "javascript", "php", "python", "r", "sql"]
  scores = pd.DataFrame(model.predict(X_tfidf), columns=labels)
  # There is exactly one input row, so take the best-scoring label of row 0.
  pred_tag = scores.columns[scores.iloc[0].argmax()]

  print(f"Predicted Tag: {pred_tag}")
  if tag != "":
    print(f"Actual Tag: {tag}")

  return pred_tag


def model_demo():
  """Interactively read a title and a body, print a three-line separator, and call `predict_tag`.

  Nothing is returned; the prediction is reported by `predict_tag`'s own printing.
  """
  # Tuple elements are evaluated left to right, so the title prompt comes first.
  title, body = input("Title: "), input("Body: ")
  for _ in range(3):
    print("---")
  predict_tag(title, body)

# Prepare Environment

In [4]:
# Bind the loaded objects to the module-level names that predict_tag() reads as globals.
model, matrix_title, matrix_body, tfidf = load_objects()

# Model Demo

In [7]:
# Run one interactive prediction; prompts for a title and a body via input().
model_demo()

Title: How to group in Pandas query
Body: Not sure how to get a sum per group. I can get it to work in SQL but not Pandas.
---
---
---
Predicted Tag: python
