In [1]:
import numpy as np
import pandas as pd
import re 
import nltk
import os

# Part 1 Q1 Import and analyse the data set

In [2]:
data=pd.read_csv("blogtext.csv")
data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


### Checking for null values in dataset

In [3]:
data.isna().any()


id        False
gender    False
age       False
topic     False
sign      False
date      False
text      False
dtype: bool

#### There are no null values in the dataset

In [4]:
data.shape


(681284, 7)

#### The dataset is very large, we can take fewer samples as our machine won't be able to handle this huge data.

In [5]:
data=data.head(10000)


In [6]:
data.shape

(10000, 7)

In [7]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  int64 
 1   gender  10000 non-null  object
 2   age     10000 non-null  int64 
 3   topic   10000 non-null  object
 4   sign    10000 non-null  object
 5   date    10000 non-null  object
 6   text    10000 non-null  object
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


#### We can drop columns like id, and date as they won't help in model building. 

In [8]:
data.drop(['id','date'], axis=1, inplace=True)


In [9]:
data.head(10)

Unnamed: 0,gender,age,topic,sign,text
0,male,15,Student,Leo,"Info has been found (+/- 100 pages,..."
1,male,15,Student,Leo,These are the team members: Drewe...
2,male,15,Student,Leo,In het kader van kernfusie op aarde...
3,male,15,Student,Leo,testing!!! testing!!!
4,male,33,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...
5,male,33,InvestmentBanking,Aquarius,I had an interesting conversation...
6,male,33,InvestmentBanking,Aquarius,Somehow Coca-Cola has a way of su...
7,male,33,InvestmentBanking,Aquarius,"If anything, Korea is a country o..."
8,male,33,InvestmentBanking,Aquarius,Take a read of this news article ...
9,male,33,InvestmentBanking,Aquarius,I surf the English news sites a l...


In [10]:
data.dtypes

gender    object
age        int64
topic     object
sign      object
text      object
dtype: object

#### All the data types are object, except for age. We can change it to match the others.

In [11]:
data['age'] = data['age'].astype('object')

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  10000 non-null  object
 1   age     10000 non-null  object
 2   topic   10000 non-null  object
 3   sign    10000 non-null  object
 4   text    10000 non-null  object
dtypes: object(5)
memory usage: 390.8+ KB


# Part 1 Q2 Perform data pre-processing on the data:

## •Data cleansing by removing unwanted characters, spaces, stop words etc.

In [13]:
data['text']=data['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))

#### We have removed all the unwanted characters

In [14]:
data['text']=data['text'].apply(lambda x: x.lower())

#### We have converted all text to lower case

In [15]:
data['text']=data['text'].apply(lambda x: x.strip())

#### We have removed all the unwanted spaces

In [16]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

In [17]:
data['text']=data['text'].apply(lambda x: ' '.join([words for words in x.split() if words not in stopwords]))

#### We have removed all the stopwords

# Part 1 Q3 Design, train, tune and test the best text classifier.

#### To design the model, first we have to merge all the labels into one as we are dealing with a multilabel problem.

In [18]:
data['labels']=data.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1)

In [19]:
data.head()

Unnamed: 0,gender,age,topic,sign,text,labels
0,male,15,Student,Leo,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,male,15,Student,Leo,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,male,15,Student,Leo,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,male,15,Student,Leo,testing testing,"[male, 15, Student, Leo]"
4,male,33,InvestmentBanking,Aquarius,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


#### We have created a new column 'labels' where we have merged all the labels. We can now remove gender, age, topic, and sign columns. 

In [20]:
data=data[['text','labels']]


In [21]:
data.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


### We now have the data ready to build the model. We will now split the data into X, and Y. 

In [22]:
X=data['text']
y=data['labels']


In [23]:
from sklearn.feature_extraction.text import CountVectorizer


In [24]:
vectorizer=CountVectorizer(binary=True, ngram_range=(1,2))


In [25]:
X=vectorizer.fit_transform(X)


In [26]:
X[1]

<1x643302 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [27]:
vectorizer.get_feature_names()[:10]


['aa',
 'aa amazing',
 'aa anger',
 'aa compared',
 'aa keeps',
 'aa nice',
 'aa sd',
 'aaa',
 'aaa come',
 'aaa discount']

In [28]:
label_counts=dict()

for labels in data.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
        else:
            label_counts[label]=1

In [29]:
label_counts

{'male': 5916,
 '15': 602,
 'Student': 1137,
 'Leo': 301,
 '33': 136,
 'InvestmentBanking': 70,
 'Aquarius': 571,
 'female': 4084,
 '14': 212,
 'indUnk': 3287,
 'Aries': 4198,
 '25': 386,
 'Capricorn': 215,
 '17': 1185,
 'Gemini': 150,
 '23': 253,
 'Non-Profit': 71,
 'Cancer': 504,
 'Banking': 16,
 '37': 33,
 'Sagittarius': 1097,
 '26': 234,
 '24': 655,
 'Scorpio': 971,
 '27': 1054,
 'Education': 270,
 '45': 16,
 'Engineering': 127,
 'Libra': 491,
 'Science': 63,
 '34': 553,
 '41': 20,
 'Communications-Media': 99,
 'BusinessServices': 91,
 'Sports-Recreation': 80,
 'Virgo': 236,
 'Taurus': 812,
 'Arts': 45,
 'Pisces': 454,
 '44': 3,
 '16': 440,
 'Internet': 118,
 'Museums-Libraries': 17,
 'Accounting': 4,
 '39': 79,
 '35': 2315,
 'Technology': 2654,
 '36': 1708,
 'Law': 11,
 '46': 7,
 'Consulting': 21,
 'Automotive': 14,
 '42': 14,
 'Religion': 9,
 '13': 42,
 'Fashion': 1622,
 '38': 46,
 '43': 6,
 'Publishing': 4,
 '40': 1,
 'Marketing': 156,
 'LawEnforcement-Security': 10,
 'HumanReso

### We need multibinarizer for this problem. 

In [30]:
from sklearn.preprocessing import MultiLabelBinarizer
binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))

In [31]:
Y=binarizer.fit_transform(data.labels)


### Splitting the data into training and testing

In [32]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2)


In [33]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

### Creating LogisticsRegression classifier model

In [34]:
model=LogisticRegression(solver='lbfgs')
model=OneVsRestClassifier(model)

In [35]:
model.fit(Xtrain,Ytrain)


OneVsRestClassifier(estimator=LogisticRegression())

In [36]:
Ypred=model.predict(Xtest)


In [37]:
Ypred_inversed = binarizer.inverse_transform(Ypred)
y_test_inversed = binarizer.inverse_transform(Ytest)

In [38]:
for i in range(5):
    print('Text:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        Xtest[i],
        ','.join(y_test_inversed[i]),
        ','.join(Ypred_inversed[i])
    ))

Text:	  (0, 203034)	1
  (0, 593438)	1
  (0, 312628)	1
  (0, 22273)	1
  (0, 100151)	1
  (0, 3589)	1
  (0, 529247)	1
  (0, 170072)	1
  (0, 440127)	1
  (0, 225036)	1
  (0, 504728)	1
  (0, 363386)	1
  (0, 225428)	1
  (0, 118891)	1
  (0, 616967)	1
  (0, 425391)	1
  (0, 580466)	1
  (0, 204735)	1
  (0, 258888)	1
  (0, 497999)	1
  (0, 317556)	1
  (0, 2199)	1
  (0, 115186)	1
  (0, 302431)	1
  (0, 559031)	1
  :	:
  (0, 559337)	1
  (0, 529668)	1
  (0, 272374)	1
  (0, 107826)	1
  (0, 374610)	1
  (0, 505013)	1
  (0, 580687)	1
  (0, 124834)	1
  (0, 80361)	1
  (0, 23286)	1
  (0, 68583)	1
  (0, 521263)	1
  (0, 512856)	1
  (0, 313172)	1
  (0, 109603)	1
  (0, 34907)	1
  (0, 455571)	1
  (0, 259165)	1
  (0, 205082)	1
  (0, 578939)	1
  (0, 145598)	1
  (0, 504941)	1
  (0, 436363)	1
  (0, 22399)	1
  (0, 543981)	1
True labels:	27,Education,Virgo,male
Predicted labels:	male


Text:	  (0, 555727)	1
  (0, 54084)	1
  (0, 570626)	1
  (0, 571580)	1
  (0, 555751)	1
True labels:	26,Gemini,indUnk,male
Predicted labels

### Building a function to show accuracy score, f1 score, aps, and recall

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(Ytest, Ypred):
    print('Accuracy score: ', accuracy_score(Ytest, Ypred))
    print('F1 score: ', f1_score(Ytest, Ypred, average='micro'))
    print('Average precision score: ', average_precision_score(Ytest, Ypred, average='micro'))
    print('Average recall score: ', recall_score(Ytest, Ypred, average='micro'))

In [40]:
print_evaluation_scores(Ytest, Ypred)


Accuracy score:  0.3175
F1 score:  0.6347557763227214
Average precision score:  0.4499984410464346
Average recall score:  0.527125


In [41]:
print ('X'*1000)

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# PART 2

#### Importing necessary libraries

In [56]:
import numpy
import numpy as np
import pandas as pd
import random
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.chat.util import Chat, reflections
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.optimizers import SGD
from nltk import word_tokenize


## Importing corpus file

In [43]:
import json
with open ('GL+Bot.json') as file:
    corpus = json.load(file)

In [44]:
print(corpus)

{'intents': [{'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}, {'tag': 'Exit', 'patterns': ['thank you', 'thanks', 'cya', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy'], 'responses': ['I hope I was able to assist you, Good Bye'], 'context_set': ''}, {'tag': 'Olympus', 'patterns': ['olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of p

### Tokenisation

In [45]:
words=[]
labels=[]
doc_x=[]
doc_y=[]

for intent in corpus['intents']:
    for pattern in intent['patterns']:
        w_temp = nltk.word_tokenize(pattern)
        words.extend(w_temp)
        doc_x.append(w_temp)
        doc_y.append(intent['tag'])
    
    if intent['tag'] not in labels:
        labels.append(intent['tag'])


### Stemming

In [46]:
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word.lower()) for word in words if word != '?']
words = sorted(list(set(words)))
labels = sorted(labels)

#[lemmatizer.lemmatize(word) for word in words if word not

In [57]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
CountVec = CountVectorizer(ngram_range=(1,1),
                           stop_words='english')

Count_data = CountVec.fit_transform(words)
 
cv_dataframe=pd.DataFrame(Count_data.toarray(),columns=CountVec.get_feature_names())
print(cv_dataframe)

#bag_of_words = vec.transform(corpus)

     able  access  activation  ada  adam  aifl  aiml  ann  artificial  \
0       0       0           0    0     0     0     0    0           0   
1       1       0           0    0     0     0     0    0           0   
2       0       1           0    0     0     0     0    0           0   
3       0       0           1    0     0     0     0    0           0   
4       0       0           0    1     0     0     0    0           0   
..    ...     ...         ...  ...   ...   ...   ...  ...         ...   
153     0       0           0    0     0     0     0    0           0   
154     0       0           0    0     0     0     0    0           0   
155     0       0           0    0     0     0     0    0           0   
156     0       0           0    0     0     0     0    0           0   
157     0       0           0    0     0     0     0    0           0   

     backward  ...  validation  visible  wasted  weight  whats  window  work  \
0           0  ...           0        0    

### Defingin bag_of_words 

In [59]:
def bag_of_words(s,w):
    bag = [0 for _ in range(len(words))]
    
    s_words = nltk.word_tokenize(s)
    s_words = [lemmatizer.lemmatize(word.lower()) for word in s_words]
    
    for se in s_words:
        for i,w in enumerate (words):
            if w ==se:
                bag[i]=1
    
    x3 = numpy.array(bag)
    x3 = x3.reshape(1,158)
    return numpy.array(x3)

## Defining training and target variables

In [60]:
train = []
target = []
out_empty = [0 for _ in range(len(labels))]

for x, doc in enumerate (doc_x):
    bag=[]
    
    w_temp = [lemmatizer.lemmatize(word.lower()) for word in doc]
    
    for w in words:
        if w in w_temp:
            bag.append(1)
        else:
            bag.append(0)
    
    output_row = out_empty[:]
    output_row[labels.index(doc_y[x])] = 1
    
    train.append(bag)
    target.append(output_row)

### Building sequential model 

In [61]:
model = Sequential()
model.add(Dense(64, input_dim = len(train[0]), activation = 'relu' ))
model.add(Dropout(0.5))
model.add(Dense(32, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation = 'softmax'))

### Building Stochastic Gradient Descent function

In [62]:
sgd = SGD(lr = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)


  super(SGD, self).__init__(name, **kwargs)


### Building a compiler

In [63]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])

### Fitting the data into the model, running with 200 epochs

In [64]:
model.fit(np.array(train),np.array(target), epochs = 200, batch_size = 1, verbose = 1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7fd14d814070>

### Defining Chat function, that would initiate the chatbot

In [65]:
def chat():
    print("Chat with ycbot (type bye to quit)")
    print("If answer is not right (type:*)")
    while True:
        inp = input("\n\nYou:")
        if inp.lower()=="*":
            print("ycbot: Please rephrase your question")
        if inp.lower()=="bye":
            print('Have a great learning')
            break
            
        results = model.predict([bag_of_words (inp,words)]) #THIS IS THE PROBLEM LINE 
        result_index = numpy.argmax(results)
        tag = labels[result_index]
        
        for tg in corpus['intents']:
            if tg['tag']==tag:
                responses =tg['responses']
        print (random.choice(responses))

### Using the Chat function, asking several questions and ending the chat with 'bye'

In [55]:
chat()

Chat with ycbot (type bye to quit)
If answer is not right (type:*)




You: hello


Hello! how can i help you ?




You: I need help with olympus


Please raise a ticket and we'll get back to you https://olympus1.mygreatlearning.com/program_support?pb_id=8058




You: Please explain Naive Bayes theorem


I hope this link will help you : https://en.wikipedia.org/wiki/Machine_learning




You: knn help


I hope this link will help you : https://en.wikipedia.org/wiki/Machine_learning




You: what is deep learning


Link: https://en.wikipedia.org/wiki/Neural_network




You: explain softmax


Link: https://en.wikipedia.org/wiki/Neural_network




You: what is your name


I am your virtual learning assistant created by Yashank Chopra




You: what the hell


Please use respectful words




You: my problem is not solved


Tarnsferring the request to your PM




You: you helped me


I hope I was able to assist you, Good Bye




You: bye


Have a great learning
