# 1. Importing Packages

In [1]:
### Basic Packages
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Ref: https://docs.python.org/3/library/string.html
import re,string,unicodedata
from bs4 import BeautifulSoup

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### NLTK Imports
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import sentiwordnet as swn, wordnet
from nltk.corpus.reader.wordnet import WordNetError
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

In [2]:
# Ref: https://www.nltk.org/data.html
# Ref: https://www.nltk.org/_modules/nltk/corpus.html
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

# 2. Exploration

In [3]:
df_train = pd.read_csv("../input/neuranceai/train.csv")
df_test  = pd.read_csv("../input/neuranceai/test.csv")

In [4]:
print(df_train.info())
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32165 entries, 0 to 32164
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   patient_id                  32165 non-null  int64  
 1   name_of_drug                32165 non-null  object 
 2   use_case_for_drug           32165 non-null  object 
 3   review_by_patient           32165 non-null  object 
 4   effectiveness_rating        32165 non-null  int64  
 5   drug_approved_by_UIC        32165 non-null  object 
 6   number_of_times_prescribed  32165 non-null  int64  
 7   base_score                  32165 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 2.0+ MB
None


Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,8.022969
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,6.590176
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,6.144782


In [5]:
print(df_test.info())
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10760 entries, 0 to 10759
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   patient_id                  10760 non-null  int64 
 1   name_of_drug                10760 non-null  object
 2   review_by_patient           10760 non-null  object
 3   drug_approved_by_UIC        10760 non-null  object
 4   number_of_times_prescribed  10760 non-null  int64 
 5   use_case_for_drug           10760 non-null  object
 6   effectiveness_rating        10760 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 588.6+ KB
None


Unnamed: 0,patient_id,name_of_drug,review_by_patient,drug_approved_by_UIC,number_of_times_prescribed,use_case_for_drug,effectiveness_rating
0,163740,Mirtazapine,"""I&#039;ve tried a few antidepressants over th...",28-Feb-12,22,Depression,10
1,39293,Contrave,"""Contrave combines drugs that were used for al...",5-Mar-17,35,Weight Loss,9
2,208087,Zyclara,"""4 days in on first 2 weeks. Using on arms an...",3-Jul-14,13,Keratosis,4
3,23295,Methadone,"""Ive been on Methadone for over ten years and ...",18-Oct-16,21,Opiate Withdrawal,7
4,97013,Ambien,"""Ditto on rebound sleepless when discontinued....",13-Jan-15,44,Insomnia,2


In [6]:
print("TRAINING DATASET")
print("#Unique Drug Names: ", len(np.unique(df_train['name_of_drug'])))
print("#Unique Use Cases: ", len(np.unique(df_train['use_case_for_drug'])))

print("\nTEST DATASET")
print("#Unique Drug Names: ", len(np.unique(df_test['name_of_drug'])))
print("#Unique Use Cases: ", len(np.unique(df_test['use_case_for_drug'])))

TRAINING DATASET
#Unique Drug Names:  2220
#Unique Use Cases:  636

TEST DATASET
#Unique Drug Names:  1478
#Unique Use Cases:  461


# 3. Pre-Processing

In [7]:
# Making a list of all the stopwords
stop_words = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop_words.update(punctuation)

# A function to determine the tag for every word
# Ref: https://www.nltk.org/api/nltk.tag.html
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
# Creating a function to lemmatize the review text
# Ref: https://www.nltk.org/_modules/nltk/stem/wordnet.html
lemmatizer = WordNetLemmatizer()
def lemmatize_words(review_by_patient):
    final_text = []
    for i in review_by_patient.split():
        if i.strip().lower() not in stop_words:
            # Tag of the word, used for lemmatization
            pos = pos_tag([i.strip()]) 
            word = lemmatizer.lemmatize(i.strip(),get_simple_pos(pos[0][1]))
            final_text.append(word.lower())
    return " ".join(final_text)

In [8]:
is_df_train = os.path.isfile("../input/neuranceai/new_df_train.csv")
is_df_test  = os.path.isfile("../input/neuranceai/new_df_test.csv")

if is_df_train and is_df_test:
    new_df_train = pd.read_csv("../input/neuranceai/new_df_train.csv")
    new_df_test = pd.read_csv("../input/neuranceai/new_df_test.csv")
else:
    reviews_train = df_train['review_by_patient']
    reviews_test = df_test['review_by_patient']
    print(reviews_train.shape, reviews_test.shape)

    # Performing Lemmatization
    reviews_train = reviews_train.apply(lemmatize_words)
    reviews_test = reviews_test.apply(lemmatize_words)
    print(reviews_train.shape, reviews_test.shape)

    # Creating a new dataset with lemmatized words
    new_df_train = df_train.drop(['review_by_patient'], axis = 1)
    new_df_test  = df_test.drop(['review_by_patient'], axis = 1)
    print(new_df_train.shape, new_df_test.shape)

    new_df_train = pd.concat([new_df_train, reviews_train], axis = 1)
    new_df_test = pd.concat([new_df_test, reviews_test], axis = 1)
    print(new_df_train.shape, new_df_test.shape)

    new_df_train.to_csv("new_df_train.csv", index = False)
    new_df_test.to_csv("new_df_test.csv", index = False)

In [9]:
# Removing the variables from the memory, only works with one variable at a time
# reset_selective -f <variable>

# To find the variables in the memory
# who_ls

In [10]:
tf_idf = TfidfVectorizer(max_features = 5000)
reviews_train = tf_idf.fit_transform(new_df_train['review_by_patient'])
reviews_test = tf_idf.transform(new_df_test['review_by_patient'])
print(reviews_train.shape, reviews_test.shape)

(32165, 5000) (10760, 5000)


# 4. Preparing the Dataset for modelling purposes

In [11]:
X_train = new_df_train.drop(['patient_id', 'name_of_drug', 'use_case_for_drug', 
    'drug_approved_by_UIC', 'review_by_patient', 'base_score'], axis = 1)
Y_train = new_df_train['base_score']
X_test = new_df_test.drop(['patient_id', 'name_of_drug', 'use_case_for_drug', 
    'drug_approved_by_UIC', 'review_by_patient'], axis = 1)
test_ids = new_df_test['patient_id']

X_train = pd.concat([X_train, pd.DataFrame(reviews_train.toarray())], axis = 1)
X_test = pd.concat([X_test, pd.DataFrame(reviews_test.toarray())], axis = 1)
print("Trainin Set:", X_train.shape, Y_train.shape)
print("Test Set:", X_test.shape, test_ids.shape)

Trainin Set: (32165, 5002) (32165,)
Test Set: (10760, 5002) (10760,)


In [12]:
# Dividing the labelled examples into training and validation examples
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size = 0.1)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(28948, 5002) (3217, 5002) (28948,) (3217,)


In [13]:
# reset_selective -f Y_train

# 5. Training the Model

In [14]:
import xgboost as xg

In [15]:
# Instantiation
xgb_r = xg.XGBRegressor(n_estimators=100)

In [16]:
# Fitting the model
xgb_r.fit(x_train, y_train)
 
# Predict the model
preds_train = xgb_r.predict(x_train)
preds_val = xgb_r.predict(x_val)
 
RMSE_train = MSE(y_train, preds_train, squared = False)
RMSE_val = MSE(y_val, preds_val, squared = False)

print("Root Mean Squared Error for Training Set:", RMSE_train)
print("Root Mean Squared Error for Validation Set:", RMSE_val)

Root Mean Squared Error for Training Set: 0.13457613062886667
Root Mean Squared Error for Validation Set: 0.17643779044449706


# 6. Predicting Results

In [17]:
preds_test = xgb_r.predict(X_test)
print(test_ids.shape, preds_test.shape)
preds_test = pd.Series(preds_test)
sam_sub = pd.concat([test_ids, preds_test], axis = 1)
sam_sub.to_csv("sample_submission_100.csv")

(10760,) (10760,)


In [18]:
from sklearn.metrics import r2_score
r2_score(y_train, preds_train)   # best = 1

0.9928140909456578