In [None]:
!pip install --upgrade fastpunct

In [20]:
import torch
import numpy as np
from dataset import*
from basemodel import*
from utils import*
from torch.autograd import Variable
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [18]:
!gdown --id "1H8lDkPzYbhn1_mezi9hnbIUrFhSTDvRZ" # down loading chrome_reviews.csv
!gdown --id "1-0Q78JNERCaTgle8Ez7UW8T_rsOJe3bI" # downloading glove_file.txt 
!gdown --id "1GTv6aDXnhLa706AK2Cit6ktgk5imSNtE" # downloading vocab_file.txt

df=pd.read_csv('./chrome_reviews.csv')
df = df[df['Text'].notna()]
df['Text']=df['Text'].apply(lambda x:remove_emoji(x))
df = df[df['Text']!=""]
print(df.shape)
df.head()


Downloading...
From: https://drive.google.com/uc?id=1H8lDkPzYbhn1_mezi9hnbIUrFhSTDvRZ
To: /content/drive/My Drive/assign_comp/assign_comp/chrome_reviews.csv
100% 1.98M/1.98M [00:00<00:00, 31.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-0Q78JNERCaTgle8Ez7UW8T_rsOJe3bI
To: /content/drive/My Drive/assign_comp/assign_comp/glove.6B.50d.txt
171MB [00:01, 139MB/s]
Downloading...
From: https://drive.google.com/uc?id=1GTv6aDXnhLa706AK2Cit6ktgk5imSNtE
To: /content/drive/My Drive/assign_comp/assign_comp/vocab_star.txt
203MB [00:01, 182MB/s]
(7106, 10)


Unnamed: 0,ID,Review URL,Text,Star,Thumbs Up,User Name,Developer Reply,Version,Review Date,App ID
0,3886,https://play.google.com/store/apps/details?id=...,This is very helpfull aap.,5,0,INDIAN Knowledge,,83.0.4103.106,2020-12-19,com.android.chrome
1,3887,https://play.google.com/store/apps/details?id=...,Good,3,2,Ijeoma Happiness,,85.0.4183.127,2020-12-19,com.android.chrome
2,3888,https://play.google.com/store/apps/details?id=...,Not able to update. Neither able to uninstall.,1,0,Priti D BtCFs-29,,85.0.4183.127,2020-12-19,com.android.chrome
3,3889,https://play.google.com/store/apps/details?id=...,Nice app,4,0,Ajeet Raja,,77.0.3865.116,2020-12-19,com.android.chrome
4,3890,https://play.google.com/store/apps/details?id=...,Many unwanted ads,1,0,Rams Mp,,87.0.4280.66,2020-12-19,com.android.chrome


In [19]:
data,labels=df["Text"].values,df["Star"].astype(int).values
glove_file= './glove.6B.50d.txt'
loader=TextLoader()
#loader.build_vocab(data,50,glove_file=glove_file,write_file="vocab_star.txt",correct=True)
loader.build_vocab(data,50,vocab_file="./vocab_star.txt")


In [12]:
data_=np.array(loader.text)
X_train, X_test, y_train, y_test = train_test_split(data_, labels, test_size=0.20, random_state=42)
train_dataset=MyDataset(X_train,y_train)
val_dataset=MyDataset(X_test,y_test)


In [13]:
model_lstm=BaseModel(vocab_size=len(loader.vectors),embedding_size=50,hidden_size=256)

In [14]:

class Classifier(nn.Module):
  def __init__(self,hidden_size,output_size,model_lstm,lm=False):
    super(Classifier,self).__init__()
    self.lm=lm
    self.rnn=model_lstm
    if lm:
      self.linear=nn.Linear(hidden_size,output_size)
    else:
      self.linear=nn.Sequential(nn.Linear(hidden_size,128),
                                    nn.ReLU(),
                                    nn.Linear(128,output_size))
      
  def forward(self,input):
    last_output=self.rnn(input)
    if self.lm:
      output=self.linear(last_output)
      output=F.log_softmax(output, dim=2)
    else:
      output=self.linear(last_output[:,-1,:])
      output=F.log_softmax(output, dim=1)
    return output
 

#model=Classifier(256,len(loader.vectors),model_lstm,lm=True).to(device)
model=Classifier(256,5,model_lstm).to(device)
lm=False

In [15]:
from tqdm import tqdm

def train_model(train_data,model,optimizer,batch_size,bptt,lm=False,max_len=100):
  loss,count=0,0
  criterion=nn.NLLLoss()
  model.train()
  for data,target in tqdm(DataLoader(train_data,batch_size=batch_size)):
    if lm:
      data,target=loader.postprocess(batch=data,lm=lm,max_len=max_len)
    else:
      data=loader.postprocess(batch=data,max_len=max_len)
    data_chunk=data.split(bptt,dim=1)
    target_chunk=target.split(bptt,dim=1) if lm else target.unsqueeze(1).repeat(1,bptt).split(1,dim=1)
    bptt_loss= 0
    count+=1
    model.rnn.rnn_state=None
    for data,target in zip(data_chunk,target_chunk):
      data=data.to(device)
      target=target.to(device)
      model.zero_grad()
      predictions = model(data)
      loss = criterion(predictions.permute(0,2,1), target) if lm else criterion(predictions, target.squeeze(1)-1)
      loss.backward()
      optimizer.step()
      model.rnn.repackage_rnn_state()
      bptt_loss += loss.item()
     
    loss+=bptt_loss/bptt
  return loss/count

def eval_model(val_data,model,batch_size,lm=False,max_len=100):
  model.eval()
  epoch_acc,count=0.0,0
  with torch.no_grad():
    for data_,target in tqdm(DataLoader(val_data,batch_size=batch_size)):
      model.rnn.rnn_state=None
      if lm:
        data,target=loader.postprocess(batch=data_,lm=lm,max_len=max_len)
      else:
        data=loader.postprocess(batch=data_,max_len=max_len)

      data=data.to(device) 
      target=target.to(device)
      predictions = model(data)

      acc = accuracy(predictions, target,lm) 
      epoch_acc+=acc
      count+=1
    epoch_acc = epoch_acc/count
    
  return epoch_acc

optimizer=torch.optim.Adam(model.parameters(),lr=0.001)
criterion=nn.CrossEntropyLoss()
for epoch in range(6):
  train_loss=train_model(train_dataset,model,optimizer,16,20,max_len=40,lm=lm)
  val_acc=eval_model(val_dataset,model,16,max_len=40,lm=lm)
  print("train_loss : {}    accuracy : {} ".format(train_loss,val_acc))

100%|██████████| 356/356 [00:08<00:00, 44.12it/s]
100%|██████████| 89/89 [00:00<00:00, 552.93it/s]
  1%|▏         | 5/356 [00:00<00:07, 45.07it/s]

train_loss : 0.0021446768660098314    accuracy : 0.6978330612182617 


100%|██████████| 356/356 [00:08<00:00, 43.83it/s]
100%|██████████| 89/89 [00:00<00:00, 543.72it/s]
  1%|▏         | 5/356 [00:00<00:07, 44.56it/s]

train_loss : 0.0027653651777654886    accuracy : 0.7083668112754822 


100%|██████████| 356/356 [00:08<00:00, 43.76it/s]
100%|██████████| 89/89 [00:00<00:00, 522.73it/s]
  1%|▏         | 5/356 [00:00<00:08, 42.37it/s]

train_loss : 0.0023569592740386724    accuracy : 0.7139847874641418 


100%|██████████| 356/356 [00:08<00:00, 43.72it/s]
100%|██████████| 89/89 [00:00<00:00, 556.86it/s]
  1%|▏         | 5/356 [00:00<00:08, 43.64it/s]

train_loss : 0.0021452002692967653    accuracy : 0.7196027040481567 


100%|██████████| 356/356 [00:08<00:00, 44.16it/s]
100%|██████████| 89/89 [00:00<00:00, 546.89it/s]
  1%|▏         | 5/356 [00:00<00:07, 45.11it/s]

train_loss : 0.0021419362165033817    accuracy : 0.7231139540672302 


100%|██████████| 356/356 [00:08<00:00, 43.88it/s]
100%|██████████| 89/89 [00:00<00:00, 545.35it/s]

train_loss : 0.00243746442720294    accuracy : 0.7181982398033142 



