In [1]:
import warnings
# Blanket filter: no category/module arguments are given, so every warning is ignored.
# NOTE(review): this also hides deprecation notices from pandas/numpy — consider narrowing.
warnings.filterwarnings('ignore')

In [2]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [3]:
# Load the train/test splits; both files are tab-separated.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

# Report (rows, columns) of each split.
print(f"Shape of train : {train.shape}")
print(f"Shape of test : {test.shape}")

Shape of train : (161297, 7)
Shape of test : (53766, 7)


In [4]:
# Peek at the first rows of the freshly loaded training frame.
train.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [7]:
# Number of distinct drug names (missing values, if any, count as one value).
train['drugName'].nunique(dropna=False)

3436

## Preprocessing

In [8]:
# Column dtypes and non-null counts, to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   161297 non-null  int64  
 1   drugName     161297 non-null  object 
 2   condition    160398 non-null  object 
 3   review       161297 non-null  object 
 4   rating       161297 non-null  float64
 5   date         161297 non-null  object 
 6   usefulCount  161297 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 8.6+ MB


In [9]:
# Fraction of missing values per column (mean of the boolean null mask).
train.isna().mean()

Unnamed: 0     0.000000
drugName       0.000000
condition      0.005574
review         0.000000
rating         0.000000
date           0.000000
usefulCount    0.000000
dtype: float64

In [10]:
# Less than 1% of rows have a missing value, so those rows are simply discarded.
train.dropna(axis=0, how='any', inplace=True)
# drop=False keeps the old row labels as a new `index` column (removed again near the end).
train.reset_index(drop=False, inplace=True)

In [11]:
# Per-condition count of distinct values in every column; the row labels of the
# result also reveal which values `condition` actually takes.
train.groupby(['condition']).nunique()

Unnamed: 0_level_0,index,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0</span> users found this comment helpful.,104,104,42,1,104,10,99,1
10</span> users found this comment helpful.,28,28,28,1,28,9,28,1
110</span> users found this comment helpful.,1,1,1,1,1,1,1,1
11</span> users found this comment helpful.,10,10,10,1,10,6,10,1
121</span> users found this comment helpful.,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...
unctional Gastric Disorde,1,1,1,1,1,1,1,1
ungal Infection Prophylaxis,1,1,1,1,1,1,1,1
ungal Pneumonia,1,1,1,1,1,1,1,1
von Willebrand's Disease,7,7,3,1,5,1,5,4


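*Some values in the `condition` column are not medical conditions at all but leftover HTML fragments (e.g. `0</span> users found this comment helpful.`). These rows carry no usable label.*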

In [12]:
#dropping unwanted observations
x = train[train["condition"].str.contains("</span>",na=False)].index
train.drop(train.index[x],inplace=True)

In [15]:
# Re-run the per-condition summary to confirm the "</span>" rows are gone.
train.groupby(['condition']).nunique()

Unnamed: 0_level_0,index,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ADHD,3383,3383,55,1,2192,10,1531,170
AIDS Related Wasting,5,5,3,1,4,4,4,4
AV Heart Block,1,1,1,1,1,1,1,1
Abdominal Distension,2,2,2,1,1,1,1,1
Abnormal Uterine Bleeding,2096,2096,74,1,1340,10,930,85
...,...,...,...,...,...,...,...,...
unctional Gastric Disorde,1,1,1,1,1,1,1,1
ungal Infection Prophylaxis,1,1,1,1,1,1,1,1
ungal Pneumonia,1,1,1,1,1,1,1,1
von Willebrand's Disease,7,7,3,1,5,1,5,4


In [13]:
# How many rows were flagged as having "</span>" in `condition`.
len(x)

900

In [16]:
# Check the frame after the row drops; reset_index has added an `index` column.
train.head()

Unnamed: 0.1,index,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [17]:
def clean(x):
    """Reduce one raw review string to space-separated alphanumeric tokens.

    Every match of the pattern is replaced by a single space:
      * the HTML entity fragments ``&#039`` and ``&amp;``
      * ``@`` followed by ASCII letters/digits
      * any single character that is not an ASCII letter, digit, space or tab
      * ``scheme://...`` runs up to the next whitespace
    The text is then split on whitespace and re-joined with single spaces, so
    leading, trailing and repeated whitespace collapses.

    Note: an apostrophe encoded as ``&#039;`` becomes a word break
    (``I&#039;ve`` -> ``I ve``), because both the entity and the ``;`` are
    replaced by spaces.

    Args:
        x: the review text (``str``).

    Returns:
        The cleaned text (``str``); empty if nothing alphanumeric remains.
    """
    # Raw string: the non-raw form relied on the invalid escape sequences
    # "\w", "\/" and "\S", which newer Python versions warn about.
    pattern = r"&#039|&amp;|(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", x).split())

In [18]:
# Run every review through clean() and store the result back in the same column.
train['review'] = train['review'].map(clean)

In [19]:
# Confirm the `review` column now holds the cleaned text.
train.head()

Unnamed: 0.1,index,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,0,206461,Valsartan,Left Ventricular Dysfunction,It has no side effect I take it in combination...,9.0,"May 20, 2012",27
1,1,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,"April 27, 2010",192
2,2,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,"December 14, 2009",17
3,3,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,"November 3, 2015",10
4,4,35696,Buprenorphine / naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,"November 27, 2016",37


In [20]:
# Drug names ordered from fewest to most rows; the first 797 are discarded.
# NOTE(review): 797 is a hard-coded cut-off — presumably the number of drugs with a
# single review; confirm, and prefer an explicit count threshold if that is the intent.
drug_row_counts = train.groupby(['drugName']).count().sort_values(by='condition')
index_1 = drug_row_counts.index[:797]
# Row labels of every review that belongs to one of those drugs.
index_2 = train.index[train['drugName'].isin(index_1)]
train.drop(index_2, inplace=True)

In [21]:
# Conditions ordered from fewest to most rows; the first 117 are discarded.
# NOTE(review): 117 is a hard-coded cut-off — presumably the number of conditions left
# with a single review; confirm, and prefer an explicit count threshold if so.
condition_row_counts = train.groupby(['condition']).count().sort_values(by='drugName')
index_3 = condition_row_counts.index[:117]
# Row labels of every review that belongs to one of those conditions.
index_4 = train.index[train['condition'].isin(index_3)]
train.drop(index_4, inplace=True)

In [22]:
# First ten rows after the rare-drug and rare-condition filters.
train.head(10)

Unnamed: 0.1,index,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,0,206461,Valsartan,Left Ventricular Dysfunction,It has no side effect I take it in combination...,9.0,"May 20, 2012",27
1,1,95260,Guanfacine,ADHD,My son is halfway through his fourth week of I...,8.0,"April 27, 2010",192
2,2,92703,Lybrel,Birth Control,I used to take another oral contraceptive whic...,5.0,"December 14, 2009",17
3,3,138000,Ortho Evra,Birth Control,This is my first time using any form of birth ...,8.0,"November 3, 2015",10
4,4,35696,Buprenorphine / naloxone,Opiate Dependence,Suboxone has completely turned my life around ...,9.0,"November 27, 2016",37
5,5,155963,Cialis,Benign Prostatic Hyperplasia,2nd day on 5mg started to work with rock hard ...,2.0,"November 28, 2015",43
6,6,165907,Levonorgestrel,Emergency Contraception,He pulled out but he cummed a bit in me I took...,1.0,"March 7, 2017",5
7,7,102654,Aripiprazole,Bipolar Disorde,Abilify changed my life There is hope I was on...,10.0,"March 14, 2015",32
8,8,74811,Keppra,Epilepsy,I Ve had nothing but problems with the Keppera...,1.0,"August 9, 2016",11
9,9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,I had been on the pill for many years When my ...,8.0,"December 8, 2016",1


In [23]:
# (rows, columns) remaining after all row filters.
train.shape

(158584, 8)

In [24]:
# Remove the two leftover row-id columns: `index` (added by reset_index) and
# `Unnamed: 0` (present in the input file).
train.drop(columns=['index', 'Unnamed: 0'], inplace=True)

In [25]:
# Persist the cleaned training set. The DataFrame index is written as the first,
# unnamed CSV column.
# NOTE(review): pass index=False if downstream readers do not need it.
train.to_csv("train_new.csv", index=True)