# Lab 02

## Google Drive Access Setup

In [0]:
# Code to download file into Colaboratory:
# Install/upgrade PyDrive quietly via an IPython shell escape.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials
# (presumably the ones established by authenticate_user() above — confirm).
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the later download cells call CreateFile() on.
drive = GoogleDrive(gauth)

## Word2Vec

In [2]:
# Standard library
import pprint
import re
import warnings

# Third-party: tokenisation, embeddings, XML parsing
import nltk
from gensim.models import Word2Vec
from lxml import etree
from nltk.tokenize import sent_tokenize, word_tokenize

# Download NLTK's 'punkt' tokenizer package for the tokenisers imported above.
nltk.download('punkt')

# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Downloading TED Scripts from Google Drive 
Click on the "Files" tab on the left side to check that the file has been downloaded successfully.

In [0]:
# Google Drive file id of the TED transcript dump.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = '1B47OiEiG2Lo1jUY6hy_zMmHBxfKQuJ8-'
downloaded = drive.CreateFile({'id':id}) 
# Save the Drive file's content locally under this name (read by the preprocessing cell).
downloaded.GetContentFile('ted_en-20160408.xml')  

### Data Preprocessing

In [6]:
# Parse the TED transcript dump. The context manager closes the file handle
# as soon as parsing is done (it was previously left open for the whole session).
with open('ted_en-20160408.xml', 'r', encoding='UTF8') as targetXML:
    target_text = etree.parse(targetXML)

# Getting contents of <content> tag from the xml file
parse_text = '\n'.join(target_text.xpath('//content/text()'))

# Removing "Sound-effect labels" using regular expression (i.e. (Audio), (Laughter))
content_text = re.sub(r'\([^)]*\)', '', parse_text)

# Tokenising the text into sentences by using NLTK library
sent_text = sent_tokenize(content_text)

# Lower-casing each sentence and replacing every run of characters other than
# a-z / 0-9 (punctuation, symbols, whitespace) with a single space
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", string.lower())
    for string in sent_text
]

# Tokenising each sentence to process individual word
sentences = [word_tokenize(sentence) for sentence in normalized_text]

# Prints only 10 (tokenised) sentences
print(sentences[:10])

[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation'], ['both', 'are', 'necessary', 'but', 'it', 'can', 'be', 'too', 'much', 'of', 'a', 'good', 'thing'], ['consider', 'facit'], ['i', 'm', 'actually', 'old', 'enough', 'to', 'remember', 'them'], ['facit', 'was', 'a', 'fantastic', 'company'], ['they', 'were', 'born', 'deep', 'in', 'the', 'swedish', 'forest', 'and', 'they', 'made', 'the', 'best', 'mechanical', 'calculators', 'in', 'the', 'world'], ['everybody', 'used', 'them'], ['and', 'what', 'did', 'facit', 'do', 'when', 'the', 'electronic', 'calculator', 'came', 'along'], ['they', 'continued', 'doing', 'exactly', 'the', 'same']]


### Word2Vec - Continuous Bag-Of-Words (CBOW)

In [0]:
# Train Word2Vec in CBOW mode (sg=0) on the tokenised TED sentences.
wv_cbow_model = Word2Vec(
    sentences=sentences,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [8]:
# Words the CBOW model ranks as most similar to "man";
# the result is a list of (word, similarity score) pairs, pretty-printed below.
similar_words=wv_cbow_model.wv.most_similar("man")
pprint.pprint(similar_words)

[('woman', 0.844805121421814),
 ('guy', 0.8092465400695801),
 ('boy', 0.7867581844329834),
 ('lady', 0.7734391689300537),
 ('soldier', 0.7707762122154236),
 ('gentleman', 0.7431775331497192),
 ('girl', 0.7310270667076111),
 ('kid', 0.7269785404205322),
 ('surgeon', 0.6918785572052002),
 ('writer', 0.6819698810577393)]


### Word2Vec - Skip Gram

In [0]:
# Train Word2Vec in skip-gram mode (sg=1) on the same tokenised TED sentences.
wv_sg_model = Word2Vec(
    sentences=sentences,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [10]:
# Same "man" query as for CBOW, now against the skip-gram model,
# so the two neighbour lists can be compared.
similar_words=wv_sg_model.wv.most_similar("man")
pprint.pprint(similar_words)

[('woman', 0.7658183574676514),
 ('guy', 0.7475895285606384),
 ('boy', 0.7224724292755127),
 ('soldier', 0.7056839466094971),
 ('pianist', 0.6882875561714172),
 ('psychiatrist', 0.683869481086731),
 ('jr', 0.6833134293556213),
 ('adage', 0.6786198616027832),
 ('girl', 0.677717924118042),
 ('son', 0.6774390935897827)]


## Word2Vec vs FastText

The Word2Vec Skip-Gram model cannot find words similar to "electrofishing", because "electrofishing" is not in its vocabulary.

In [11]:
# "electrofishing" is not in the Word2Vec vocabulary, so this lookup raises
# KeyError. The error is the point of the demonstration, so catch and display
# it instead of letting it abort a "Run All" of the notebook.
try:
    similar_words = wv_sg_model.wv.most_similar("electrofishing")
    pprint.pprint(similar_words)
except KeyError as error:
    print("KeyError:", error)

KeyError: ignored

### FastText - Skip Gram

In [0]:
from gensim.models import FastText

In [0]:
# Train FastText in skip-gram mode (sg=1) with the same hyper-parameters
# as the Word2Vec models.
ft_sg_model = FastText(
    sentences,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [0]:
# Same "electrofishing" query that failed with Word2Vec, now against the
# FastText skip-gram model.
result=ft_sg_model.wv.most_similar("electrofishing")
pprint.pprint(result)

### FastText - Continuous Bag-Of-Words (CBOW)

In [0]:
# Train FastText in CBOW mode (sg=0) with the same hyper-parameters
# as the Word2Vec models.
ft_cbow_model = FastText(
    sentences,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [0]:
# Same "electrofishing" query, now against the FastText CBOW model.
result=ft_cbow_model.wv.most_similar("electrofishing")
pprint.pprint(result)

## King - Man + Woman = ?

Try both CBOW and Skip Gram model to calculate "King - Man + Woman = ?"

In [0]:
# king - man + woman, answered by the Word2Vec CBOW model (single best match).
result = wv_cbow_model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)

In [0]:
# king - man + woman, answered by the Word2Vec skip-gram model (single best match).
result = wv_sg_model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)


In [0]:
# king - man + woman, answered by the FastText CBOW model (single best match).
result = ft_cbow_model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)


In [0]:
# king - man + woman, answered by the FastText skip-gram model (single best match).
result = ft_sg_model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)


This is not what we expected. There is probably not enough training data for the models to answer "Queen".

Let's try with a much larger dataset (Google has already trained Word2Vec on Google News data).


### Google's Pretrained Word2Vec (Google News)
[Link to Project](https://code.google.com/archive/p/word2vec/)


In [0]:
# Beware, this file is big (3.39GB) 
# Google Drive file id of the pretrained Google News vectors.
id2 = '1cOEYOQRd1VXi7ROShhqZbioCcePvgnR5'
downloaded = drive.CreateFile({'id':id2}) 
# Save the Drive file's content locally; the next cell loads it by this filename.
downloaded.GetContentFile('GoogleNews-vectors-negative300.bin')  

In [0]:
from gensim.models import KeyedVectors
import warnings
# Keep FutureWarning messages out of this cell's output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the pretrained vectors from the downloaded file; binary=True because
# the .bin file is read as binary word2vec format.
filename = 'GoogleNews-vectors-negative300.bin'
gn_wv_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [0]:
# king - man + woman, answered by the pretrained Google News vectors
# (queried directly on the KeyedVectors object, single best match).
result = gn_wv_model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)

# Exercise
**You need to implement a Lab2 chatbot that returns a duckduckgo search url with the similar words (retrieved by Google's Word2Vec)**
**(i.e. https://duckduckgo.com/?q=sydney+hotels+Hotel+motel+boutique_hotel)**


1.  The chatbot should start by saying "Welcome. What would you like to search?"
2.  When the user enters a search term (it must be a single word), the chatbot should return a search URL containing the top 4 similar words (retrieved by Google's Word2Vec).
3.  After showing the search URL, the chatbot should ask whether the user is satisfied with the result: "Are you satisfied with the result?"
4.  If the user says "yes", the chatbot says "Thank you! See you again." and shuts down.
5.  If the user says "no", the chatbot asks "What would you like to search?" and goes back to step 2.






## Example Communication (Between Lab2 Chatbot and you) 


```
Chatbot: Welcome. What would you like to search?
You: hotel
Chatbot: https://duckduckgo.com/?q=hotels+Hotel+motel+boutique_hotel
Chatbot: Are you satisfied with the result?
You: no
Chatbot: What would you like to search?
You: hospital
Chatbot: https://duckduckgo.com/?q=Hospital+hopsital+hosptial+hospitals
Chatbot: Are you satisfied with the result?
You: yes
Chatbot: Thank you! See you again.
```





Useful information: [API for Gensim Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html)



In [0]:
# You should submit "ipynb" file (You can download it from "File" > "Download .ipynb") to Canvas


def response(word, model=None):
  """Build the chatbot's reply for a one-word search term.

  Looks up the 4 words most similar to ``word`` and joins them with '+'
  into a DuckDuckGo search URL, followed by the satisfaction question.

  Args:
    word: The search term typed by the user (surrounding whitespace is
      ignored).
    model: Object exposing ``most_similar(positive=[...], topn=N)`` that
      returns ``(word, score)`` pairs. Defaults to the notebook's
      pretrained ``gn_wv_model``.

  Returns:
    A two-line string: the ``Chatbot: https://duckduckgo.com/?q=...`` line
    and the ``Chatbot: Are you satisfied with the result?`` line.
    If the term is not in the model's vocabulary, the URL falls back to
    searching for the term itself instead of raising ``KeyError``.
  """
  from urllib.parse import quote  # local import keeps this cell self-contained

  rob_response = 'Chatbot: https://duckduckgo.com/?q='
  rob_reply = 'Chatbot: Are you satisfied with the result?'

  if model is None:
    model = gn_wv_model
  term = word.strip()

  try:
    # most_similar yields (word, score) pairs; only the words go in the URL.
    similar = [neighbour for neighbour, _score in model.most_similar(positive=[term], topn=4)]
  except KeyError:
    # Out-of-vocabulary term: no neighbours are available, search the term as typed.
    similar = [term]

  # Percent-encode each word so characters such as '&' or '#' cannot break the query string.
  query = '+'.join(quote(neighbour, safe='') for neighbour in similar)
  return '{}{}\n{}'.format(rob_response, query, rob_reply)

# Interactive chatbot loop. It alternates between two kinds of turns:
#   * search turn   - the input is a search term; print response(term), which
#                     shows the search URL and asks whether the user is satisfied
#   * feedback turn - the input answers that question: "yes" ends the session,
#                     "no" goes back to a search turn
awaiting_feedback = False
print('Chatbot: Welcome. What would you like to search?')
while True:
  try:
    user_response = input().strip()
  except EOFError:
    # No more input available (e.g. a non-interactive run): stop quietly.
    break

  if not awaiting_feedback:
    if not user_response:
      # Nothing was typed: ask for a search term again.
      print('Chatbot: what would you like to search?')
      continue
    print(response(user_response))
    awaiting_feedback = True
    continue

  answer = user_response.lower()
  if answer == 'yes':
    print('Chatbot: Thank you! See you again')
    break
  if answer == 'no':
    awaiting_feedback = False
    print('Chatbot: what would you like to search?')
  else:
    # Neither "yes" nor "no": repeat the question rather than guessing.
    print('Chatbot: Are you satisfied with the result?')
    
    
  
  
  
  
  




# Extension

## Word2Vec with Tensorflow

If you want to implement Word2Vec with Tensorflow, here is a [sample code](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py) by [tensorflow](https://github.com/tensorflow).  

## Word Embedding Visual Inspector (WEVI)
If you would like to visualise how Word2Vec is learning, the following link is useful https://ronxin.github.io/wevi/