In [1]:
# Load Word2Vec model
# Reads pre-trained vectors stored in the binary word2vec format from FILE
# and binds the result to `model`.
# NOTE(review): recent gensim releases expose this loader on `KeyedVectors`
# rather than on `Word2Vec` — confirm against the installed gensim version.
from gensim.models import Word2Vec as w2v

# NOTE(review): absolute, machine-specific path; the name FILE is reassigned
# by later cells, so it always refers to the most recently loaded file.
FILE = "C:/Users/MyPC/Desktop/Vegito/W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)



In [2]:
# Load the dataset
# Reads the CSV at FILE into the DataFrame `df`; the cell's last expression
# displays the number of rows.
import pandas as pd

# NOTE(review): absolute, machine-specific path.
FILE = "C:/Users/MyPC/Desktop/Vegito/clean_dataset.csv"
df = pd.read_csv(FILE)
len(df)

6580

In [3]:
# Get the unique words in the dataset, in order of first appearance.
# Comments are split on whitespace. A companion set answers the
# "seen before?" question with a hashed lookup, so the result list is not
# scanned linearly for every token.
unique_words = []
seen_words = set()

for comment in df['Comment']:
    for word in comment.split():
        if word not in seen_words:
            seen_words.add(word)
            unique_words.append(word)

len(unique_words)

19286

In [4]:
# Collect the dataset words that the reddit model has no entry for, then
# report how many there are and what share of all unique words they make up.
not_in_model = [word for word in unique_words if word not in model]

print("NUMBER OF WORDS NOT IN MODEL: ", len(not_in_model))
print("PERCENTAGE MISSING: ", (len(not_in_model)/len(unique_words)) * 100)

NUMBER OF WORDS NOT IN MODEL:  1141
PERCENTAGE MISSING:  5.916208648760759


In [5]:
# Sort the words then print them. Then, write them in a text file (DONE)
#not_in_model = sorted(not_in_model)
#with open('Missing words.txt', 'w') as fh:
#    for word in not_in_model:
#        fh.write("{}\n".format(word))

In [6]:
# Get the number of words in the Word2Vec model
# Uses the length of `model.syn0` — presumably one row per vocabulary word;
# TODO confirm against the gensim documentation for the installed version.
print(len(model.syn0))

1146604


In [7]:
# Get words that are similar. This returns tuples in a list:
# (neighbour word, similarity score) pairs for the `top_n` closest words.
word = 'asshole'
top_n = 10

similar_words = model.most_similar(word, topn=top_n)

# Display the stored result instead of querying the model a second time.
similar_words

[('arsehole', 0.8830243349075317),
 ('asshat', 0.8711205124855042),
 ('prick', 0.8389500975608826),
 ('dickhead', 0.8373932242393494),
 ('douchebag', 0.8241000175476074),
 ('dickbag', 0.8229827880859375),
 ('ahole', 0.8007912039756775),
 ('idiot', 0.7848369479179382),
 ('jackass', 0.7815600037574768),
 ('douche', 0.7751066088676453)]

In [8]:
# Calculate the mean of the similarity scores of the retrieved neighbours
import numpy as np

mean_cos_distance = np.mean([score for _neighbour, score in similar_words])

In [9]:
# Keep only the neighbours whose score is strictly above the mean score
words_above_mean = [
    neighbour
    for neighbour, score in similar_words
    if score > mean_cos_distance
]

# Number of kept words, stored as a float
total_words = float(len(words_above_mean))

In [10]:
words_above_mean

['arsehole', 'asshat', 'prick', 'dickhead', 'douchebag', 'dickbag']

In [11]:
# Zero-filled float32 accumulator of length 300
# (300 presumably matches the dimensionality of the "300d" model vectors)
avgWordsFeature = np.zeros(300, dtype=np.float32)

In [12]:
# Sum the vectors of the selected words, one word at a time
for word in words_above_mean:
    avgWordsFeature = avgWordsFeature + model[word]

# Turn the sum into a mean by dividing by the number of selected words
avgWordsFeature = avgWordsFeature / total_words

In [13]:
# One float32 slot per word feature: only the first N words of a sentence
# are stored, and N is fixed at 5 here.
sentenceWordFeature = np.zeros(5, dtype=np.float32)
print(sentenceWordFeature)

[ 0.  0.  0.  0.  0.]


In [14]:
# Test with a single sentence: each of the first len(sentenceWordFeature)
# words becomes one scalar feature.

sentence = "you goddamn bastard"
tokens = sentence.split()
max_words = len(sentenceWordFeature)

# 1 when every word fits in the feature array, 0 when the sentence is cut off
complete_sentence = 1 if len(tokens) <= max_words else 0

for i, word in enumerate(tokens[:max_words]):

    # Known word: mean of its vector components; unknown word: -1.0 marker
    sentenceWordFeature[i] = np.mean(model[word]) if word in model else -1.0

    print(sentenceWordFeature)

[-0.00470508  0.          0.          0.          0.        ]
[-0.00470508  0.00158833  0.          0.          0.        ]
[-0.00470508  0.00158833 -0.00082337  0.          0.        ]


In [15]:
# Soundex Dictionary (Normalized)
# Letters that share a code are listed together. The vowels and Y map to the
# placeholder "." rather than to a digit. H and W have no entry.
soundex_dictionary = {
    letter: code
    for code, letters in (
        ("1", "BFPV"),
        ("2", "CGJKQSXZ"),
        ("3", "DT"),
        ("4", "L"),
        ("5", "MN"),
        ("6", "R"),
        (".", "AEIOUY"),
    )
    for letter in letters
}

In [16]:
 def getSoundex(word, mapping=None):
     """Return the four-character Soundex-style code for ``word``.

     The word is upper-cased and its first character is kept verbatim. Each
     following character is translated through the code table; H and W are
     skipped, and a code is dropped when it equals the character just before
     it in the code built so far. The "." placeholders (vowels) are removed at
     the end, and the result is truncated or right-padded with "0" to exactly
     four characters.

     Parameters
     ----------
     word : str
         The word to encode.
     mapping : dict, optional
         Table mapping an upper-case character to its one-character code.
         Defaults to the module-level ``soundex_dictionary``.

     Returns
     -------
     str
         A four-character code. An empty ``word`` yields "0000".

     Notes
     -----
     Characters that have no entry in the table (digits, punctuation,
     accented letters, ...) are ignored instead of raising ``KeyError``.
     Inputs that the previous implementation accepted produce the same code.
     """
     table = soundex_dictionary if mapping is None else mapping

     # Uppercase the word
     word = word.upper()

     # Nothing to encode: return the all-padding code rather than failing on word[0]
     if not word:
         return "0000"

     # The first character is kept as-is; codes are collected after it
     pieces = [word[0]]

     for char in word[1:]:

         # H and W are transparent: they neither emit a code nor separate codes
         if char in "HW":
             continue

         code = table.get(char)

         # Unknown characters are ignored; repeated adjacent codes collapse
         if code is None or code == pieces[-1]:
             continue

         pieces.append(code)

     # Replace period characters (vowel placeholders)
     soundex = "".join(pieces).replace(".", "")

     # Truncate to four characters, padding short codes with 0s
     return soundex[:4].ljust(4, "0")

In [17]:
getSoundex('fucked')

'F230'

In [18]:
# Load Soundex dictionary
import pickle

# NOTE(review): absolute, machine-specific path.
FILE = "C:/Users/MyPC/Desktop/Vegito/Word Dictionaries/soundex_dict.pk"

# NOTE(security): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(FILE, "rb") as fh:
    soundex_dict = pickle.load(fh)

In [19]:
# Compare the number of entries in the Soundex dictionary with the number
# of distinct Soundex values it contains.
print("TOTAL WORDS: %i" % (len(soundex_dict)))

# De-duplicate the dictionary's values
unique_soundex = list(set(soundex_dict.values()))

print("UNIQUE SOUNDEX VALUES: %i" % (len(unique_soundex)))

print("PERCENTAGE: ", (len(unique_soundex)/len(soundex_dict)) * 100)

TOTAL WORDS: 1146604
UNIQUE SOUNDEX VALUES: 6599
PERCENTAGE:  0.5755256391919092


In [20]:
# Load in missing word text file: one entry per line, with surrounding
# whitespace (including the trailing newline) stripped.
FILE = "C:/Users/MyPC/Desktop/Vegito/Text Files/Missing words.txt"
missing_words = []

with open(FILE, 'r') as fh:
    missing_words.extend(line.strip() for line in fh)

In [21]:
# Count how many missing words have a Soundex code that also occurs among
# the known codes.
# `unique_soundex` is a list, so a set is built once here to make each
# membership test a hashed lookup instead of a scan of the whole list.
known_soundex = set(unique_soundex)

count = sum(1 for word in missing_words if getSoundex(word) in known_soundex)

print("SIMILAR WORDS FOUND: %i" % (count))

SIMILAR WORDS FOUND: 1141


In [22]:
# Load the transformed file
# NOTE(review): absolute, machine-specific path.
FILE = "C:/Users/MyPC/Desktop/Vegito/Word Dictionaries/soundex_words_list.pk"

# NOTE(security): unpickling can execute arbitrary code — only load pickle
# files that come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(FILE, "rb") as fh:
    soundex_words_list = pickle.load(fh)

# Look up the entry stored under the Soundex code of a sample word
soundex = getSoundex('fuckingloser')
word_list = soundex_words_list[soundex]

print(len(word_list))

441


In [23]:
# Find average length of a missing word

# Character count of every missing word, in file order
word_lengths = list(map(len, missing_words))

print("AVERAGE LENGTH: ", sum(word_lengths)/len(missing_words))

AVERAGE LENGTH:  8.55740578439965


In [42]:
# Testing: for every missing word, print the first model word (in
# index2word order) that starts with the missing word's first `index`
# characters. Nothing is printed when no model word matches.
index = 5

for word in missing_words:

    subs_word = word[:index]

    # next() stops at the first hit, just like breaking out of a scan
    first_match = next(
        (candidate for candidate in model.index2word if candidate.startswith(subs_word)),
        None,
    )

    if first_match is not None:
        print(subs_word, first_match)

abuse abuse
acidd acidd
ackno acknowledge
adole adolescent
adumb adumb
afroc afrocentric
aidr aidra
ajaja ajaja
ajue ajuever
aktu aktuell
akunt akunt
alber alberta
aldas aldas
alegi alegi
aleja alejandro
alist alistar
aljaz aljazeera
allah allah
allin allin
allra allrandom
almaj almajid
ameli amelia
ameri american
ameri american
ammed ammed
among among
anara anarachy
annag annagudbjorg
annav annaversary
annel anneliese
anoel anoelr
anony anonymous
antag antagonist
anton antonio
antun antunes
apple apple
arbet arbeta
arbit arbitrary
areal areal
argua arguably
aritc aritcle
arrno arrnold
arrog arrogant
artic article
asper aspergers
assho asshole
assis assistance
astro astronaut
aunty aunty
autob autobiography
auxil auxiliary
avand avand
axles axles
babas babas
badha badham
bagun bagunar
baroc baroclinic
beare bearer
beaut beautiful
becke becker
beean beeans
beebl beeblebrox
belle belle
belli belligerent
beln belnd
berli berlin
berli berlin
berth berth
besit besitos
besle besler
beslu bes