# Prediction Testing

Randomly testing comments. Currently using Dr. Soon Lay Ki's method of mean similarity:

- For a specific word, find the top 5 most similar words
- Compute the mean cosine similarity of those words
- Keep only the words whose similarity is above that mean
- Sum the feature vectors of the kept words and average them

In [1]:
# Import necessary libraries here
import numpy as np
import os
import pickle
import re
import itertools
import warnings
warnings.filterwarnings('ignore')

from gensim.models import Word2Vec as w2v

In [2]:
# Load necessary models
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format -- confirm against the installed version.
FILE = "C:/Users/MyPC/Desktop/Vegito/W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)

# SECURITY: unpickling executes arbitrary code, so only load classifier
# files that come from a trusted source.
FILE = "C:/Users/MyPC/Desktop/Vegito/ML Models/rf_avg_hybrid.sav"
# Use a context manager so the file handle is closed once the model is loaded
with open(FILE, 'rb') as clf_file:
    clf = pickle.load(clf_file)

In [3]:
# Dr. Soon's idea (Use Top 5 words)
def wordsAverage(word, top_n=5, w2v_model=None):
    """Return the averaged vector of the neighbours most similar to *word*.

    The ``top_n`` most similar words are fetched, the mean of their
    similarity scores is computed, and the vectors of the neighbours whose
    score is strictly above that mean are averaged.

    Args:
        word: Vocabulary word to look up.
        top_n: Number of nearest neighbours to request (defaults to 5).
        w2v_model: Object providing ``most_similar(word, topn=...)`` and
            ``obj[word]`` vector lookup. Defaults to the module-level
            ``model``.

    Returns:
        A float32 numpy vector with the same length as the word vectors.

    Fallbacks (each of these previously produced an all-NaN vector through
    a division by zero):
        * no neighbour is strictly above the mean (e.g. all scores are
          equal, or ``top_n`` is 1) -> every returned neighbour is averaged;
        * no neighbours are returned at all -> the word's own vector is
          returned.
    """
    vectors = model if w2v_model is None else w2v_model

    # most_similar returns a list of (neighbour, similarity) tuples
    similar_words = vectors.most_similar(word, topn=top_n)

    if not similar_words:
        # Copy so callers never receive a reference into the model's storage
        return np.array(vectors[word], dtype="float32")

    # Mean similarity score across the returned neighbours
    mean_similarity = np.mean([score for _, score in similar_words])

    # Keep only the neighbours scoring strictly above the mean
    words_above_mean = [
        neighbour for neighbour, score in similar_words
        if score > mean_similarity
    ]
    if not words_above_mean:
        # Nothing stands out from the mean: use every neighbour instead
        words_above_mean = [neighbour for neighbour, _ in similar_words]

    # Sum the selected vectors and average them; the vector length comes
    # from the vectors themselves rather than a hard-coded 300
    summed = np.sum(
        [vectors[neighbour] for neighbour in words_above_mean],
        axis=0,
        dtype="float32",
    )
    return np.divide(summed, float(len(words_above_mean)))

In [4]:
# Function to transform the data
def makeFeatureVec(words, num_features):
    """Turn a cleaned sentence into a single averaged feature vector.

    Every whitespace-separated token found in the module-level ``model``
    contributes the vector returned by ``wordsAverage``; the contributions
    are summed and divided by the number of contributing tokens.

    Args:
        words: Sentence as one string; it is split on whitespace.
        num_features: Length of the feature vector to accumulate into.

    Returns:
        The averaged float32 vector, or the result of
        ``characterVec(words, model, num_features)`` when no token is in
        the vocabulary.
    """
    # Running total of the per-word vectors
    featureVec = np.zeros((num_features,), dtype="float32")

    # Number of tokens that were found in the vocabulary
    nwords = 0

    for word in words.split():
        if word in model:  # and word not in stop_words:
            nwords += 1
            featureVec = np.add(featureVec, wordsAverage(word))

    # Check for the empty case BEFORE dividing, so no 0/0 NaN vector (and
    # RuntimeWarning) is produced only to be thrown away.
    if nwords == 0:
        # NOTE(review): characterVec is not defined in the visible part of
        # this file -- confirm it is provided elsewhere, otherwise this
        # branch raises NameError.
        return characterVec(words, model, num_features)

    # Divide the total by the number of words to get the average
    return np.divide(featureVec, float(nwords))

In [5]:
# Cleaning the data. Use the same approach when cleaning reddit data
def cleaningSentence(sentence):
    """Normalise a raw sentence into lower-case, letters-only text.

    Steps, applied in order:
      1. Every URL-like token (``scheme://...``) is replaced by a space.
      2. Every run of one repeated character is cut to at most two
         characters (e.g. "Looooolll" -> "Looll"). This applies to all
         characters, not only letters.
      3. The text is lower-cased and whitespace is collapsed to single
         spaces.
      4. Every character that is not a-z or whitespace is replaced by a
         space.

    Because step 4 substitutes spaces rather than deleting characters, the
    result may contain consecutive or trailing spaces.

    Args:
        sentence: Raw input text.

    Returns:
        The cleaned sentence as a single string.
    """
    # Remove URLs
    clean_sentence = re.sub(r'\w+:\/\/\S+', ' ', sentence)

    # Word standardizing: keep at most two characters of each repeated run
    clean_sentence = ''.join(
        ''.join(itertools.islice(run, 2))
        for _, run in itertools.groupby(clean_sentence)
    )

    # Lower-case and collapse whitespace.
    # (Contraction expansion is currently disabled.)
    words = " ".join(clean_sentence.lower().split())

    # Remove non-alphabets. A raw string is required here: "\s" in a normal
    # string literal is an invalid escape sequence.
    return re.sub(r"[^a-z\s]", " ", words)

In [None]:
sentence = input("Enter sentence to classify: ")
print("\n\n")
# Clean the sentence
sentence = cleaningSentence(sentence)

# Transform the sentence into its feature vector
sentence = makeFeatureVec(sentence, 300)

# Predict using Machine Learning (Use Random Forest)
# predict_proba expects a 2-D (n_samples, n_features) array, so the single
# feature vector is reshaped into one row before being passed in.
output = clf.predict_proba(np.asarray(sentence).reshape(1, -1))

# Column 1 of the first (only) row is reported as the cyberbully probability
print("CYBERBULLY PROBABILITY: ", output[0][1])