# Introduction

# 1. Importing Packages

In [None]:
### Basic Packages
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Ref: https://docs.python.org/3/library/string.html
import re,string,unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import tensorflow as tf
import tensorflow.keras.layers as tfl

### NLTK Imports
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn, wordnet
from nltk.corpus.reader.wordnet import WordNetError
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

In [None]:
# Ref: https://www.nltk.org/data.html
# Ref: https://www.nltk.org/_modules/nltk/corpus.html
# Download every NLTK resource this notebook uses, not only 'omw-1.4':
#   stopwords                     -> stopwords.words('english')
#   wordnet / omw-1.4             -> WordNetLemmatizer
#   averaged_perceptron_tagger*   -> pos_tag (the '_eng' name is the one newer
#                                    NLTK releases look up; older ones use the
#                                    un-suffixed name, so both are requested)
# nltk.download() skips resources that are already present and up to date.
for nltk_resource in ('omw-1.4', 'wordnet', 'stopwords',
                      'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource)

In [None]:
# Seed the TensorFlow and NumPy global random generators with the same value
seed = 0
tf.random.set_seed(seed)
np.random.seed(seed)

# 2. Exploration

In [None]:
# Read the training and test CSV files into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/neuranceai/train.csv", "../input/neuranceai/test.csv")
)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
df_train.info()
df_train.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
df_test.info()
df_test.head()

In [None]:
# Report how many distinct drug names and use cases each split contains.
# Series.nunique() replaces len(np.unique(...)): np.unique has to sort the values,
# which raises on an object column that mixes strings with NaN.
# dropna=False keeps a missing value counted as its own category.
for split_title, split_df in (("TRAINING DATASET", df_train), ("\nTEST DATASET", df_test)):
    print(split_title)
    print("#Unique Drug Names: ", split_df['name_of_drug'].nunique(dropna=False))
    print("#Unique Use Cases: ", split_df['use_case_for_drug'].nunique(dropna=False))

# 3. Pre-Processing

In [None]:
# Stop-word set = NLTK's English stop words plus every character in string.punctuation
punctuation = list(string.punctuation)
stop_words = set(stopwords.words('english')) | set(punctuation)

# Translate a POS tag into the matching WordNet part-of-speech constant
# Ref: https://www.nltk.org/api/nltk.tag.html
def get_simple_pos(tag):
    """Return the WordNet POS constant selected by the first letter of `tag`.

    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV; any other tag falls back to wordnet.NOUN.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which is not a key and so yields NOUN
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)
    
# Creating a function to lemmatize the review text
# Ref: https://www.nltk.org/_modules/nltk/stem/wordnet.html
lemmatizer = WordNetLemmatizer()
def lemmatize_words(review_by_patient):
    """Lemmatize a review and drop its stop words.

    The review is split on whitespace, the whole token list is POS-tagged in
    one call, and every token whose lower-cased form is not in `stop_words`
    is lemmatized with the WordNet POS derived from its tag.

    Parameters
    ----------
    review_by_patient : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        is treated as an empty review.

    Returns
    -------
    str
        Lower-cased lemmas joined by single spaces ("" when nothing is left).
    """
    # Missing reviews reach here as float NaN, which has no .split()
    if not isinstance(review_by_patient, str):
        return ""

    tokens = review_by_patient.split()
    if not tokens:
        return ""

    final_text = []
    # Tag the whole review at once: the tagger then sees each word in its
    # context, and it is invoked once per review instead of once per word.
    for token, tag in pos_tag(tokens):
        if token.lower() in stop_words:
            continue
        word = lemmatizer.lemmatize(token, get_simple_pos(tag))
        final_text.append(word.lower())
    return " ".join(final_text)

In [None]:
# Load the lemmatized datasets when both cached CSVs exist under ../input/neuranceai;
# otherwise lemmatize the reviews now and write the result to the working directory.
cached_train_csv = "../input/neuranceai/new_df_train.csv"
cached_test_csv = "../input/neuranceai/new_df_test.csv"
is_df_train = os.path.isfile(cached_train_csv)
is_df_test  = os.path.isfile(cached_test_csv)

if not (is_df_train and is_df_test):
    reviews_train = df_train['review_by_patient']
    reviews_test = df_test['review_by_patient']
    print(reviews_train.shape, reviews_test.shape)

    # Lemmatize every review (element-wise)
    reviews_train = reviews_train.map(lemmatize_words)
    reviews_test = reviews_test.map(lemmatize_words)
    print(reviews_train.shape, reviews_test.shape)

    # Every column except the raw review text
    new_df_train = df_train.drop(columns = ['review_by_patient'])
    new_df_test  = df_test.drop(columns = ['review_by_patient'])
    print(new_df_train.shape, new_df_test.shape)

    # Append the lemmatized reviews as the last column
    new_df_train = pd.concat([new_df_train, reviews_train], axis = 1)
    new_df_test = pd.concat([new_df_test, reviews_test], axis = 1)
    print(new_df_train.shape, new_df_test.shape)

    new_df_train.to_csv("new_df_train.csv", index = False)
    new_df_test.to_csv("new_df_test.csv", index = False)
else:
    new_df_train = pd.read_csv(cached_train_csv)
    new_df_test = pd.read_csv(cached_test_csv)

In [None]:
# Removing variables from memory: the argument is a regular expression, and every variable whose name matches it is deleted
# %reset_selective -f <regex>

# To find the variables in the memory
# who_ls

In [None]:
# Load the cached token-index matrices when both CSVs exist; otherwise build them
# from the lemmatized reviews and write them to the working directory.
is_indices_train = os.path.isfile("../input/neuranceai/indices_review_train_100.csv")
is_indices_test  = os.path.isfile("../input/neuranceai/indices_review_test_100.csv")

if is_indices_train and is_indices_test:
    indices_tr = pd.read_csv("../input/neuranceai/indices_review_train_100.csv")
    indices_te = pd.read_csv("../input/neuranceai/indices_review_test_100.csv")
else:
    # A review that lemmatized to "" is read back from CSV as NaN, and
    # CountVectorizer refuses NaN documents, so restore the empty string.
    reviews_train = new_df_train['review_by_patient'].fillna("")
    reviews_test = new_df_test['review_by_patient'].fillna("")

    # Vocabulary is learned from the training reviews only
    count_vec = CountVectorizer(max_features = 5000)
    count_vec.fit(reviews_train)
    vocab = count_vec.vocabulary_
    features = count_vec.get_feature_names_out()
    analyzer = count_vec.build_analyzer()
    
    def sentence_to_indices(sentence, max_len = 50):
        """Convert a review into a fixed-length list of vocabulary indices.

        Tokens produced by the vectorizer's analyzer are mapped through
        `vocab`; tokens outside the vocabulary are skipped. The result is
        truncated to `max_len` entries and right-padded with -1.
        """
        # Membership is tested on the `vocab` dict (O(1)) rather than on the
        # `features` array, which holds the same terms but needs a linear scan.
        indices = [vocab[word] for word in analyzer(sentence) if word in vocab]
        indices = indices[:max_len]
        return indices + [-1] * (max_len - len(indices))

    min_index = min(vocab.values())
    max_index = max(vocab.values())
    print(min_index, max_index)

    indices_train = [sentence_to_indices(rev, 50) for rev in tqdm(reviews_train)]
    indices_test = [sentence_to_indices(rev, 50) for rev in tqdm(reviews_test)]

    # NOTE(review): the cache looked up above is named "*_100.csv", while the files
    # written here carry no suffix and use max_len = 50 - confirm the intended naming.
    indices_tr = pd.DataFrame(indices_train)
    indices_tr.to_csv("indices_review_train.csv", index = False)
    indices_te = pd.DataFrame(indices_test)
    indices_te.to_csv("indices_review_test.csv", index = False)    
    
print(indices_tr.shape, indices_te.shape)

# 4. Preparing the Dataset for modelling purposes

In [None]:
# Columns removed from both splits before modelling
non_feature_cols = ['patient_id', 'name_of_drug', 'use_case_for_drug', 
    'drug_approved_by_UIC', 'review_by_patient']

# Target and identifiers are taken before the columns are dropped
Y_train = new_df_train['base_score']
test_ids = new_df_test['patient_id']

X_train = new_df_train.drop(columns = non_feature_cols + ['base_score'])
X_test = new_df_test.drop(columns = non_feature_cols)
print("Training Set:", X_train.shape, Y_train.shape)
print("Test Set:", X_test.shape, test_ids.shape)

# Append the token-index columns to the right of the remaining feature columns
X_train = pd.concat([X_train, indices_tr], axis = 1)
X_test = pd.concat([X_test, indices_te], axis = 1)
print("Training Set:", X_train.shape, Y_train.shape)
print("Test Set:", X_test.shape, test_ids.shape)

In [None]:
# Dividing the labelled examples into training (90%) and validation (10%) examples.
# random_state pins the shuffle so that re-running this cell yields the same split
# instead of one that depends on how much of the global NumPy RNG was consumed before.
x_train, x_val, y_train, y_val = train_test_split(
    X_train, Y_train, test_size = 0.1, random_state = 0
)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

# 5. Clearing the memory

In [None]:
reset_selective -f df_train

In [None]:
reset_selective -f df_test

In [None]:
reset_selective -f X_train

In [None]:
reset_selective -f Y_train

In [None]:
reset_selective -f indices_tr

In [None]:
reset_selective -f indices_te

# 6. Training the Model

In [None]:
# Training a model based on the 2 numerical features (the first two columns)
model1 = LinearRegression()
model1.fit(x_train.iloc[:,:2], y_train)

preds_train = model1.predict(x_train.iloc[:,:2])
preds_val   = model1.predict(x_val.iloc[:,:2])

# RMSE is computed as sqrt(MSE) rather than MSE(..., squared = False): the
# `squared` argument is deprecated in recent scikit-learn releases and later
# removed, whereas the square root of the MSE works on every version.
RMSE_train = np.sqrt(MSE(y_train, preds_train))
RMSE_val   = np.sqrt(MSE(y_val, preds_val))

print("Root Mean Squared Error for Training Set:", RMSE_train)
print("Root Mean Squared Error for Validation Set:", RMSE_val)

In [None]:
# One Hot Encoding
def ohe(x, depth, pad_index = -1):
    """One-hot encode a 2-D array of token indices.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_steps)
        Integer indices in [0, depth); positions equal to `pad_index` are padding.
    depth : int
        Length of each one-hot vector.
    pad_index : int, default -1
        Index value that marks a padded position.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_steps, depth), dtype float64
        Real positions hold a single 1 at their index. Padded positions are
        filled entirely with `pad_index`, so a masking layer configured with
        that value can skip them. Previously a padding index of -1 was used
        as a Python negative index and switched on the last slot
        (depth - 1), which made padding look like a real token.

    Raises
    ------
    IndexError
        If a non-padding index lies outside [0, depth).
    """
    idx = np.asarray(x).astype(np.intp, copy = False)
    is_pad = idx == pad_index
    is_token = ~is_pad

    # Reject indices that would otherwise wrap around or run past the end
    token_values = idx[is_token]
    if np.any((token_values < 0) | (token_values >= depth)):
        raise IndexError("index out of range for one-hot depth %d" % depth)

    x_new = np.zeros((idx.shape[0], idx.shape[1], depth))
    rows, cols = np.nonzero(is_token)
    x_new[rows, cols, idx[rows, cols]] = 1
    # A 2-D boolean mask on the 3-D array selects whole feature vectors
    x_new[is_pad] = pad_index
    return x_new

# One-hot encode (depth 5000) every column after the first two, for both splits
x_train_oh, x_val_oh = (
    ohe(np.array(split.iloc[:, 2:]), 5000) for split in (x_train, x_val)
)
print(x_train_oh.shape, x_val_oh.shape)

In [None]:
# Sequence model on the one-hot encoded reviews, assembled layer by layer:
# Masking (mask_value = -1) -> 64-unit LSTM that returns only its last output
# -> one linear output unit for the score.
model2 = tf.keras.Sequential()
model2.add(tfl.Masking(mask_value = -1))
model2.add(tfl.LSTM(units = 64, activation = 'linear', return_sequences = False))
model2.add(tfl.Dense(units = 1, activation = 'linear'))

# Trained with plain SGD on mean absolute error for 5 epochs
model2.compile(optimizer='sgd', loss='mae')
model2.fit(x_train_oh, y_train, batch_size = 32, epochs = 5)

In [None]:
# Evaluate the LSTM model on the training and validation splits
preds_train = model2.predict(x_train_oh)
preds_val   = model2.predict(x_val_oh)

# RMSE is computed as sqrt(MSE) rather than MSE(..., squared = False): the
# `squared` argument is deprecated in recent scikit-learn releases and later
# removed, whereas the square root of the MSE works on every version.
RMSE_train = np.sqrt(MSE(y_train, preds_train))
RMSE_val   = np.sqrt(MSE(y_val, preds_val))

print("Root Mean Squared Error for Training Set:", RMSE_train)
print("Root Mean Squared Error for Validation Set:", RMSE_val)

- Based on the performance of the two models, we can take a weighted sum of their predictions and use that weighted sum as the final prediction.