# Introduction
- This version uses a TF-IDF vectorizer limited to `5000` terms (`max_features`), i.e. only the terms with the highest term frequency across the corpus are kept.
- The model is trained with Linear Regression.

# 1. Importing Packages

In [1]:
### Basic Packages
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Ref: https://docs.python.org/3/library/string.html
import re,string,unicodedata
from bs4 import BeautifulSoup

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### NLTK Imports
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn, wordnet
from nltk.corpus.reader.wordnet import WordNetError
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

In [2]:
# Seed NumPy's global random number generator so that runs are repeatable.
SEED = 0
np.random.seed(SEED)

In [3]:
# Ref: https://www.nltk.org/data.html
# Ref: https://www.nltk.org/_modules/nltk/corpus.html
# Fetch every NLTK resource this notebook relies on (WordNet for the
# lemmatizer, the English stopword list and the perceptron POS tagger), not
# only 'omw-1.4', so a fresh environment does not fail with LookupError later.
# NOTE(review): newer NLTK releases may name the tagger resource
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
for resource in ('omw-1.4', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# 2. Exploration

In [4]:
# Load the raw training and test tables from the competition input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/neuranceai/train.csv", "../input/neuranceai/test.csv")
)

In [5]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
df_train.head()

In [6]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df_test.info()
df_test.head()

In [7]:
# Report how many distinct drugs and use cases each dataset contains.
# Series.nunique() is used instead of len(np.unique(...)): it does not need to
# sort the values, so it keeps working if a text column contains missing
# entries (which are simply not counted).
for title, frame in (("TRAINING DATASET", df_train), ("\nTEST DATASET", df_test)):
    print(title)
    print("#Unique Drug Names: ", frame['name_of_drug'].nunique())
    print("#Unique Use Cases: ", frame['use_case_for_drug'].nunique())

# 3. Pre-Processing

In [8]:
# Build the set of tokens to discard: NLTK's English stopwords together with
# every ASCII punctuation character.
punctuation = list(string.punctuation)
stop_words = set(stopwords.words('english')).union(punctuation)

# A function to determine the tag for every word
# Ref: https://www.nltk.org/api/nltk.tag.html
def get_simple_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The decision is made on the first letter of `tag`: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, simple_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return simple_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
# Creating a function to lemmatize the review text
# Ref: https://www.nltk.org/_modules/nltk/stem/wordnet.html
lemmatizer = WordNetLemmatizer()
def lemmatize_words(review_by_patient):
    """Return the review with stopwords removed and remaining words lemmatized.

    Parameters
    ----------
    review_by_patient : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        is treated as an empty review.

    Returns
    -------
    str
        Lower-cased lemmas of the non-stopword tokens, joined by single spaces.
    """
    # Missing reviews show up as NaN (a float), which has no .split().
    if not isinstance(review_by_patient, str):
        return ""

    tokens = review_by_patient.split()
    if not tokens:
        return ""

    final_text = []
    # Tag the whole token sequence in one call: the tagger can then use the
    # neighbouring words as context, and it is invoked once per review rather
    # than once per word.
    for token, tag in pos_tag(tokens):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # Lemmatize the lower-cased form; lemmatizing first and lower-casing
        # afterwards leaves capitalised words (e.g. "Pills") unchanged.
        final_text.append(lemmatizer.lemmatize(lowered, get_simple_pos(tag)))
    return " ".join(final_text)

In [9]:
# Reuse the lemmatized datasets when both cached CSV files are present in the
# input directory; otherwise lemmatize the raw reviews and save the result to
# the working directory.
is_df_train, is_df_test = (
    os.path.isfile(cached_csv)
    for cached_csv in ("../input/neuranceai/new_df_train.csv",
                       "../input/neuranceai/new_df_test.csv")
)

if not (is_df_train and is_df_test):
    reviews_train, reviews_test = df_train['review_by_patient'], df_test['review_by_patient']
    print(reviews_train.shape, reviews_test.shape)

    # Lemmatize every review
    reviews_train = reviews_train.apply(lemmatize_words)
    reviews_test = reviews_test.apply(lemmatize_words)
    print(reviews_train.shape, reviews_test.shape)

    # Drop the raw review column ...
    new_df_train = df_train.drop(columns = ['review_by_patient'])
    new_df_test = df_test.drop(columns = ['review_by_patient'])
    print(new_df_train.shape, new_df_test.shape)

    # ... and append the lemmatized reviews as the last column instead
    new_df_train = pd.concat([new_df_train, reviews_train], axis = 1)
    new_df_test = pd.concat([new_df_test, reviews_test], axis = 1)
    print(new_df_train.shape, new_df_test.shape)

    new_df_train.to_csv("new_df_train.csv", index = False)
    new_df_test.to_csv("new_df_test.csv", index = False)
else:
    new_df_train = pd.read_csv("../input/neuranceai/new_df_train.csv")
    new_df_test = pd.read_csv("../input/neuranceai/new_df_test.csv")

In [10]:
# Removing variables from memory: the magic takes a single regex (not a list of names)
# and deletes every variable whose name matches it
# reset_selective -f <regex>

# To find the variables in the memory
# who_ls

In [11]:
# A review that lemmatizes to an empty string is written to the cached CSV as
# an empty field and read back as NaN, which TfidfVectorizer rejects as an
# invalid document. Normalise missing reviews to "" before vectorizing.
train_text = new_df_train['review_by_patient'].fillna("")
test_text = new_df_test['review_by_patient'].fillna("")

# Fit the vocabulary (at most 5000 terms) on the training reviews only and
# reuse it to transform the test reviews.
tf_idf = TfidfVectorizer(max_features = 5000)
reviews_train = tf_idf.fit_transform(train_text)
reviews_test = tf_idf.transform(test_text)
print(reviews_train.shape, reviews_test.shape)

# 4. Preparing the Dataset for modelling purposes

In [12]:
# Columns that are identifiers, free text or otherwise not used as features.
non_feature_cols = ['patient_id', 'name_of_drug', 'use_case_for_drug',
    'drug_approved_by_UIC', 'review_by_patient']

X_train = new_df_train.drop(non_feature_cols + ['base_score'], axis = 1)
Y_train = new_df_train['base_score']
X_test = new_df_test.drop(non_feature_cols, axis = 1)
test_ids = new_df_test['patient_id']

# Give the TF-IDF columns string names. The default integer labels would sit
# next to the string-named columns kept above, and recent scikit-learn
# versions refuse DataFrames whose column names mix types.
tfidf_cols = ["tfidf_" + str(i) for i in range(reviews_train.shape[1])]

# Reuse the index of the frame being extended so that concat pairs rows by
# position even if that frame does not carry a default 0..n-1 index.
tfidf_train = pd.DataFrame(reviews_train.toarray(), index = X_train.index, columns = tfidf_cols)
tfidf_test = pd.DataFrame(reviews_test.toarray(), index = X_test.index, columns = tfidf_cols)

X_train = pd.concat([X_train, tfidf_train], axis = 1)
X_test = pd.concat([X_test, tfidf_test], axis = 1)
print("Trainin Set:", X_train.shape, Y_train.shape)
print("Test Set:", X_test.shape, test_ids.shape)

In [13]:
# Dividing the labelled examples into training (90%) and validation (10%) examples.
# random_state is fixed so the split is the same every time this cell is run,
# instead of depending on how much of NumPy's global random stream has been
# consumed since it was seeded.
x_train, x_val, y_train, y_val = train_test_split(
    X_train, Y_train, test_size = 0.1, random_state = 0)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

In [14]:
reset_selective -f X_train

In [15]:
reset_selective -f Y_train

# 5. Training the Model

In [16]:
# Fit a linear regression on the training split and score it on both splits.
lr = LinearRegression()
lr.fit(x_train, y_train)

preds_train = lr.predict(x_train)
preds_val = lr.predict(x_val)

# RMSE is computed as sqrt(MSE) rather than via `squared = False`: that
# keyword is deprecated and no longer accepted by recent scikit-learn
# releases, while the square root works with every version.
RMSE_train = np.sqrt(MSE(y_train, preds_train))
RMSE_val = np.sqrt(MSE(y_val, preds_val))

print("Root Mean Squared Error for Training Set:", RMSE_train)
print("Root Mean Squared Error for Validation Set:", RMSE_val)

# 6. Predicting Results

In [17]:
# Predict the score of every test review.
preds_test = lr.predict(X_test)
print(test_ids.shape, preds_test.shape)

# Build the submission with explicit column names. Concatenating an unnamed
# Series produced a prediction column whose CSV header was literally "0".
# NOTE(review): 'base_score' mirrors the training target column - confirm it
# is the header the submission format expects.
sam_sub = pd.DataFrame({
    'patient_id': test_ids.to_numpy(),
    'base_score': preds_test,
})
sam_sub.to_csv("sample_submission.csv", index = False)