StudentID: 23220020

In [None]:
import os
import gensim
import string
import nltk
import contractions
import unicodedata

## <font color="blue"> Data Collection </font>

Collect a large and diverse textual dataset suitable for training word embeddings. Ensure that the dataset is preprocessed: remove special characters, lowercase all words, etc. 

### Text Cleaning
`text cleaning` -> `utf-conversion` -> `case normalisation` -> `contraction expansion` -> `tokenisation` -> `removing punctuation` -> `removing stopwords`


In [None]:
def convert_utf(text):
    """Reduce *text* to plain ASCII.

    Curly single quotes become ``'``, curly double quotes become a backtick,
    and en/em dashes become ``-``. The result is NFKD-normalised, and every
    character that still has no ASCII form is dropped.
    """
    substitutions = str.maketrans({
        '\u2018': "'",
        '\u2019': "'",
        '\u201C': "`",
        '\u201D': "`",
        '\u2013': '-',
        '\u2014': '-',
    })
    decomposed = unicodedata.normalize('NFKD', text.translate(substitutions))
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def tokenise_sentences(text):
    """Clean *text* and split it into per-sentence lists of content words.

    Pipeline: ASCII conversion (``convert_utf``) -> lower-casing ->
    contraction expansion -> sentence splitting -> word tokenisation ->
    punctuation removal -> English stop-word removal.

    The NLTK tokenizer models and stop-word corpus must already be available
    (e.g. via ``nltk.download("punkt")`` and ``nltk.download("stopwords")``).

    Returns a list with one entry per sentence; each entry is the list of
    tokens that survived filtering (it may be empty).
    """
    cleaned_text = contractions.fix(convert_utf(text).lower())

    # Both lookups are the same for every sentence, so build them once
    # instead of reloading the stop-word corpus inside the loop.
    punctuation_chars = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    data = []
    for sentence in nltk.sent_tokenize(cleaned_text):
        filtered_tokens = [
            token
            for token in nltk.word_tokenize(sentence)
            # Drop any token made up solely of punctuation characters. This
            # covers single marks as well as multi-character tokens such as
            # "--", "``", "''" and "...".
            if not all(char in punctuation_chars for char in token)
            and token not in stop_words
        ]
        data.append(filtered_tokens)

    return data

## <font color="blue"> Training </font>
- Use the `Word2Vec` embedding technique.
- Utilise the Gensim library to assist with the training.
- Save the trained model for future use. 

In [None]:
# One Word2Vec model with 200-dimensional vectors; min_count=1 keeps every
# word, and sg=0 selects the CBOW training algorithm.
model = gensim.models.Word2Vec(vector_size=200, min_count=1, sg=0)

# Every text in ./dickens to train on, in training order.
file_names = [
    "924-0.txt", "pg1392.txt", "580-0.txt", "pg1407.txt",
    "700-0.txt", "650-0.txt", "pg23344.txt", "98-0.txt", "967-0.txt",
    "963-0.txt", "914-0.txt", "pg730.txt", "1289-0.txt", "653-0.txt",
    "27924-0.txt", "1400-0.txt", "pg676.txt", "766-0.txt", "pg1023.txt",
    "882-0.txt", "644-0.txt", "pg699.txt", "675-0.txt", "807-0.txt",
    "786-0.txt", "564-0.txt", "pg32241.txt", "678-0.txt", "883-0.txt",
    "pg19337.txt", "917-0.txt", "1467-0.txt",
]

for index, file_name in enumerate(file_names):
    with open(os.path.join("./dickens", file_name), "r", encoding="utf8") as file:
        content = file.read()

    data = tokenise_sentences(content)

    # The first file creates the vocabulary; every later file extends it.
    model.build_vocab(data, update=index > 0)
    model.train(data, total_examples=model.corpus_count, epochs=model.epochs)

    # Save after each file so an interrupted run keeps the progress so far.
    model.save('./model_test')
    print("Finished training on " + file_name)

print("done!")

## <font color="blue"> Web Application </font>
- Design a simple web interface where a user can input a word. 
- Implement back-end functionality to fetch the opposite of the given word
using the trained embeddings. 
- Return the opposite word to the user on the web interface.
- Use the `Flask` library for the web application.

In [None]:
from flask import Flask, request
# Flask application object; the route below and the final app.run() call
# in this cell are registered on / started from it.
app = Flask(__name__)

# Page template. home() swaps the literal word "result" inside the <p>
# element for the answer text, so that word must appear exactly once.
# The input carries id="text" so that the label's for="text" points at it.
html_form_with_message = '''
<!DOCTYPE html>
<html>
<head>
<title>Antonym App</title>
</head>
<body>
    <h2 style="color:pink">sums</h2>
    <form method="post" action="/">
        <label for="text">your word:</label><br>
        <input type="text" id="text" name="my_word"><br><br>
        <input type="submit" value="My Button">
    </form>
    <p>result</p>
</body>
</html>
'''

def my_antonym(target_word):
    """Look up "opposite" candidates for *target_word* by vector analogy.

    Asks the trained model for the words closest to
    ``woman + target_word - man`` and returns gensim's ``most_similar``
    result unchanged (a ranked sequence whose items start with the word).
    See references [1] and [2].
    """
    analogy_matches = model.wv.most_similar(
        negative=['man'],
        positive=['woman', target_word],
    )
    return analogy_matches

@app.route('/', methods=['GET', 'POST'])
def home():
    """Serve the word form and, on POST, show up to three "opposite" words.

    GET requests, and POSTs with a blank word, return the unmodified page
    template. For a POSTed word the template's "result" placeholder is
    replaced with the top matches from ``my_antonym``, or with an apology
    when the lookup raises ``KeyError``.

    Returns the HTML page as a string.
    """
    # Imported here so this handler does not rely on another cell's imports.
    from html import escape

    if request.method != 'POST':
        return html_form_with_message

    # The training text is lower-cased, so normalise the query the same way.
    word_input = request.form.get('my_word', '').strip().lower()
    if not word_input:
        return html_form_with_message

    try:
        opposite_value = my_antonym(word_input)
    except KeyError:
        # gensim raises KeyError when a queried word is not in the vocabulary;
        # report that on the page instead of failing with an HTTP 500.
        message = f"sorry, no opposite found: {word_input} is not in the vocabulary"
        return html_form_with_message.replace("result", escape(message))

    # Show at most three candidates, and never index past what came back.
    candidates = " or ".join(str(match[0]) for match in opposite_value[:3])
    if not candidates:
        return html_form_with_message

    # The word comes from the user, so escape it before it goes into HTML.
    message = f"the opposite of {word_input} is: {candidates}"
    return html_form_with_message.replace("result", escape(message))
    

# Start the Flask server for the app defined above, using Flask's default
# settings (no host, port or debug arguments are passed).
app.run()

## <font color="blue"> References </font>
- [1] https://research.wdss.io/word2vec/#Application
- [2] https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.most_similar.html
