# Data Exploration

In [2]:
import numpy as np
from collections import defaultdict
import pandas as pd
import torch 

In [15]:
data_df = pd.read_csv('covid_19.csv',names=["questions","intent"])
print(data_df.shape)
data_df.head(20)


(90, 2)


Unnamed: 0,questions,intent
0,Hello,greetings
1,good morning,greetings
2,good evening,greetings
3,good night,greetings
4,good afternoon,greetings
5,how are you,greetings
6,how is going,greetings
7,Hi,greetings
8,Hey,greetings
9,"hello, how are you?",greetings


In [7]:
response_df = pd.read_csv('response.csv',names=["intent","response"])
response_df

Unnamed: 0,intent,response
0,intent,response
1,greetings,"Hi, I'm robot RunMin, how can I help you?"
2,information,Coronavirus disease (COVID-19) is an infectiou...
3,symptoms,COVID-19 affects different people in different...
4,prevention,"Clean your hands often. Use soap and water, or..."
5,vaccine,Residents who are eligible can book an appoint...
6,travel,Mandatory qurantine is required when enter Can...
7,default,"Sorry, I don't understand your question, coul..."


# Get word vectors

In [34]:
from sentence_transformers  import SentenceTransformer
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

X_train = model.encode(data_df.questions)
X_train

array([[-0.03588703,  0.07340592,  2.16168673, ...,  0.19535828,
        -0.05371311,  0.38527844],
       [ 0.02355501,  0.11949148,  2.03480946, ...,  0.38789024,
         0.19235491, -0.25817466],
       [-0.12137323, -0.56769227,  2.42437684, ...,  0.66499331,
        -0.30690484, -0.34701315],
       ...,
       [ 0.189301  , -0.61588375,  0.84411025, ..., -0.62423787,
        -1.21815621, -0.45374108],
       [-0.0955302 , -0.31790808,  2.00628787, ..., -0.06275352,
        -0.62710438, -0.50585847],
       [ 0.1626743 , -0.16846646,  1.59673003, ...,  0.22541887,
        -0.42336589, -0.03705003]])

In [21]:
X_train.shape

(90, 768)

In [23]:
X_train

array([[-0.03558077,  0.0735217 ,  2.1618361 , ...,  0.19558029,
        -0.05353707,  0.38546947],
       [ 0.02369248,  0.11970411,  2.0344949 , ...,  0.38818726,
         0.19264594, -0.25823656],
       [-0.12126927, -0.56770873,  2.423536  , ...,  0.6652312 ,
        -0.30706912, -0.34679425],
       ...,
       [ 0.18953644, -0.6160029 ,  0.84422123, ..., -0.6242967 ,
        -1.2185243 , -0.45368654],
       [-0.0955678 , -0.31779298,  2.0061483 , ..., -0.0627726 ,
        -0.62717813, -0.50611836],
       [ 0.16311412, -0.1682854 ,  1.5968355 , ...,  0.22600546,
        -0.42258802, -0.03637498]], dtype=float32)

# Data preprocessing 


In [24]:
responses_dict = response_df.set_index('intent').T.to_dict('list')
responses_dict

{'intent': ['response'],
 'greetings': ["Hi, I'm robot RunMin, how can I help you?"],
 'information': ['Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus.Most people infected with the COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment.  Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.'],
 'symptoms': ['COVID-19 affects different people in different ways. Most infected people will develop mild to moderate illness and recover without hospitalization. Most common symptoms: fever, dry cough,tiredness. Less common symtoms: aches and pains, sore throat,diarrhoea,conjunctivitis,headache, loss of taste or smell,a rash on skin, or discolouration of fingers or toes.'],
 'prevention': ["Clean your hands often. Use soap and water, or an alcohol-based hand rub

In [30]:
trainining_sentences =data_df.questions
#conver the intent to one hot code
training_intents =  pd.get_dummies(data_df.intent)

y_labels = []

intent2label = {'greetings':0,'information':1,'prevention':2,
                'symptoms':3, 'travel':4,'vaccine' :5}

label_tensors = torch.zeros(len(trainining_sentences),1)

for intent in intent2label.keys():
    l = training_intents.loc[training_intents[intent] == 1].index.tolist()
    for idx in l :
        label_tensors[idx][0]=intent2label[intent]

# ===============================dataloader =====================

from torch.utils.data import DataLoader,Dataset
import torch.nn as nn

dataset = Dataset()


class MyDataset(Dataset):
    def __init__(self,X,y):
        X = X_train
        y = y_train
        self.X = X
        self.y = y
    def __getitem__(self, idx):
        return self.X[idx],self.y[idx]
    def __len__(self):
        return self.X.shape[0]
dataset  = MyDataset(X_train,y_train)
dataloader = DataLoader(dataset = dataset, shuffle = True, batch_size = 1 )
dataset[0]

(tensor([-3.5581e-02,  7.3522e-02,  2.1618e+00,  1.7217e-01,  1.8279e-01,
          4.0098e-01,  1.1250e-01,  9.7601e-01, -2.6738e-01,  6.3053e-02,
         -1.0020e+00,  2.8096e-01,  6.4849e-02,  5.8062e-01,  8.7697e-01,
          4.4823e-02, -7.0718e-01, -3.4456e-01,  2.5685e-01, -3.4730e-01,
         -7.1517e-01,  1.9162e-01, -6.9504e-02, -1.2401e+00, -1.2057e-01,
         -9.7718e-01,  3.1601e-01, -1.5521e+00, -2.1877e-01, -3.7430e-03,
         -1.3956e-01, -4.7111e-01,  1.0959e+00, -2.3510e-01, -5.3367e-01,
          4.9319e-01, -9.1725e-01,  3.7771e-01,  2.0662e-01, -4.0706e-01,
          1.6553e+00, -2.5628e-02,  1.3582e+00,  5.6206e-01, -5.9791e-01,
         -2.7103e-02, -5.0779e-01,  4.4849e-01, -2.5367e-01, -1.1402e+00,
         -1.1033e+00, -7.4957e-01,  1.0923e-01,  5.1481e-02, -1.0791e-01,
         -4.3790e-01,  2.4728e-01, -6.1801e-01,  3.5473e-01,  8.0062e-01,
          8.8734e-01, -7.5372e-01,  7.1330e-01,  7.4087e-01, -5.4093e-01,
          6.0985e-01,  5.9514e-01, -1.

# Modeling

In [35]:

torch.set_default_tensor_type(torch.DoubleTensor)

NUM_EPOCHS = 300

fc=torch.nn.Linear(768,6) #只使用一层线性分类器

fc = fc.double()

criterion=torch.nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(fc.parameters())

for epoch in range(NUM_EPOCHS):
    for idx, (images,labels) in enumerate(dataloader):
        x =images.reshape(-1,768)
        x = x.to(torch.double)
        labels = labels.squeeze(1).long()
        optimizer.zero_grad() #梯度清零
        preds=fc(x) #计算预测
        loss=criterion(preds,labels) #计算损失
        loss.backward() # 计算参数梯度
        optimizer.step() # 更新迭代梯度
        if epoch % 50 ==0:
            if idx % 20 ==0:
                print('epoch={}:idx={},loss={:g}'.format(epoch,idx,loss))

correct=0
total=0

for idx,(images,labels) in enumerate(dataloader):
    x =images.reshape(-1,768)
    x = x.to(torch.double)
    preds=fc(x)
    predicted=torch.argmax(preds,dim=1) #在dim=1中选取max值的索引
    if idx ==0:
        print('x size:{}'.format(x.size()))
        print('preds size:{}'.format(preds.size()))
        print('predicted size:{}'.format(predicted.size()))

    total+=labels.size(0)
    correct+=(predicted == labels).sum().item()

accuracy=correct/total
print('acc : {:1%}'.format(accuracy))

torch.save(fc.state_dict(),'classify_model1.pth')


epoch=0:idx=0,loss=1.94347
epoch=0:idx=20,loss=0.770738
epoch=0:idx=40,loss=1.13474
epoch=0:idx=60,loss=0.957802
epoch=0:idx=80,loss=0.733049
epoch=50:idx=0,loss=0.000515386
epoch=50:idx=20,loss=0.00272484
epoch=50:idx=40,loss=0.000705824
epoch=50:idx=60,loss=0.00345775
epoch=50:idx=80,loss=2.12678e-05
epoch=100:idx=0,loss=8.97797e-06
epoch=100:idx=20,loss=3.7489e-05
epoch=100:idx=40,loss=1.16102e-05
epoch=100:idx=60,loss=5.87562e-06
epoch=100:idx=80,loss=0.000105546
epoch=150:idx=0,loss=7.57253e-05
epoch=150:idx=20,loss=1.7776e-06
epoch=150:idx=40,loss=3.31441e-05
epoch=150:idx=60,loss=1.7585e-05
epoch=150:idx=80,loss=3.60519e-07
epoch=200:idx=0,loss=2.77741e-06
epoch=200:idx=20,loss=7.9143e-06
epoch=200:idx=40,loss=3.45831e-06
epoch=200:idx=60,loss=1.21873e-08
epoch=200:idx=80,loss=4.17052e-06
epoch=250:idx=0,loss=4.50137e-09
epoch=250:idx=20,loss=3.7772e-07
epoch=250:idx=40,loss=1.31168e-06
epoch=250:idx=60,loss=1.7556e-07
epoch=250:idx=80,loss=1.33415e-07
x size:torch.Size([1, 768]