# Rizzlite Word Ranking

### Import necessary libraries

In [2]:
import requests
import pandas as pd
import random
import re
import contractions
from random import uniform
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/skylerestavillo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Create personalized list of stop words, which will be set to neutral score

In [3]:
# Create personalized list of stop words
stop_words = [ "of", "should", "not", "are", "i", "her", "here", "their", "again", "can", "above", "these", "will", "all", 
"them", "has", "she", "him", "his", "itself", "it", "is", "out", "had", "he", "hers", "because", "were", "than", "not",
"and", "under", "during", "into", "am", "have", "yours", "a", "some", "have", "has", "ours", "or", "by", "our", "at",
"on", "same", "you", "does", "was", "did", "theirs", "herself", "himself",
"does", "they", "up", "between", "such", "both", "nor", "having", "are", "an", "no", "ain't", "as", "before", "with",
"have", "other", "she", "in", "for", "themselves", "do", "the", "against", "so", "ourselves", "to", "did",
"doing", "each", "been", "has", "after", "off", "but", "through", "it", "this", "own",
"any", "now", "if", "while", "down", "only", "being", "my", "had", "we", "then", "until", "from",
"further", "there", "that", "went", "those"]

word_scores = {}

for word in set(stop_words):
    word_scores[word] = 3

### Read csv file, preprocess, and set scores

In [5]:
# Read in the csv file
df = pd.read_csv('SentenceCorpus.csv')


# Define a function to preprocess each sentence
def preprocess_sentence(sentence):
    # Convert the sentence to lowercase
    sentence = sentence.lower()
    
    # Remove words containing numbers from the sentence
    sentence = re.sub(r'\b\w*\d\w*\b', '', sentence)
    
    # Break apart contractions using the contractions library
    sentence = contractions.fix(sentence)
    
    # Remove all non-alphabetic characters from the sentence
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
    
    # Ensure the sentence is returned as a string of words
    sentence = ' '.join(sentence.split())
    
    return sentence

# Preprocess each sentence in the 'Sentence' column
df['Sentence'] = df['Sentence'].apply(preprocess_sentence)

# Print the updated dataframe
print(df.head())

# Define a function to tokenize a sentence and return a list of words
def tokenize(sentence):
    return sentence.split()

# Update the word_scores dictionary with words and their scores
for index, row in df.iterrows():
    sentence = row['Sentence']
    score = row['Score']
    words = tokenize(sentence)

    for word in words:
        if word not in stop_words:
            if word not in word_scores:
                word_scores[word] = [score]
            else:
                word_scores[word].append(score)

# Calculate the average score for each word
word_final_scores = {}
for word, scores in word_scores.items():
    if isinstance(scores, list):
        word_final_scores[word] = sum(scores) / len(scores)
    else:
        word_final_scores[word] = scores

       id language                                   Sentence  Score
0  1276.0      eng                       let us try something    3.5
1  1277.0      eng                      i have to go to sleep    3.7
2  1280.0      eng  today is june and it is muiriels birthday    4.2
3  1282.0      eng                             muiriel is now    4.8
4  1283.0      eng                    the password is muiriel    4.8


### Test our dictionary

In [6]:
# Print the count of all words in the dictionary
print(f"Total words in the dictionary: {len(word_final_scores)}")

# Print 10 random words in the dictionary with their associated scores
random_words = random.sample(list(word_final_scores.items()), 10)
for word, score in random_words:
    print(f"{word}: {score}")

Total words in the dictionary: 2456
ideals: 3.4
fixed: 4.1
corns: 4.9
i: 3
notorious: 3.4
luggage: 3.15
poorest: 3.6
includes: 3.1
verbose: 3.3
highest: 2.5


### Define API Request Functions

In [7]:
API_KEY = "0191a99ddfmsh88ff5e3602511e9p1bf4e8jsn6e7eb376b38d"

# Define synonym API request
def get_synonyms(word):
    url = f"https://wordsapiv1.p.rapidapi.com/words/{word}/synonyms"
    headers = {
        "content-type": "application/octet-stream",
        "X-RapidAPI-Key": API_KEY,
        "X-RapidAPI-Host": "wordsapiv1.p.rapidapi.com"
    }
    response = requests.get(url, headers=headers)
    return response.json().get('synonyms', [])


# Definition definition API request
def get_definition(word):
    url = f"https://wordsapiv1.p.rapidapi.com/words/{word}/definitions"
    headers = {
        "content-type": "application/octet-stream",
        "X-RapidAPI-Key": API_KEY,
        "X-RapidAPI-Host": "wordsapiv1.p.rapidapi.com"
    }
    response = requests.get(url, headers=headers)
    return response.json().get('definitions', [])

## Begin machine learning to extend word scores to new words

In [8]:
# Import model
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import KeyedVectors

glove_model = api.load('glove-twitter-200')

### Define k nearest neighbor function calls

In [2]:
called_words = set()
used_words = set()

def k_nearest_neighbors(word, k=5, use_api=True, api_limit=50, depth=0):
    global api_calls, called_words
    api_calls_made = 0  # Initialize a variable to keep track of API calls made within the function

    if word in word_final_scores:
        return api_calls_made

    # Use the Words API to find synonyms if the use_api flag is set to True and the API limit is not exceeded
    if use_api and word not in called_words and api_limit > 0 and api_calls < api_limit:
        called_words.add(word)
        api_calls_made += 1  # Increment the API calls counter

        synonyms = get_synonyms(word)
        for synonym in synonyms[:k]:
            if synonym not in word_final_scores:
                api_calls_made += 1  # Increment the API calls counter
                definitions = get_definition(synonym)
                definition_score = 0
                word_count = 0
                for definition in definitions:
                    for def_word in definition['definition'].split():
                        if def_word in word_final_scores:
                            definition_score += word_final_scores[def_word]
                            word_count += 1
                        elif depth < 2:  # Check if the current depth is less than 2
                            api_calls_made += k_nearest_neighbors(def_word, depth=depth + 1)  # Recursively call the function with incremented depth

                if word_count > 0:
                    average_definition_score = definition_score / word_count
                    score_adjustment = uniform(-0.15, 0.15)

                    if average_definition_score > word_scores[word]:
                        new_score = min(5, word_scores[word] + score_adjustment)
                    else:
                        new_score = max(1, word_scores[word] + score_adjustment)

                    word_final_scores[synonym] = new_score
                else:
                    word_final_scores[synonym] = word_scores[word]
    else:
        # Use the GloVe model to find related words
        related_words = glove_model.most_similar(positive=[word], topn=k)

        # Calculate the average confidence score of the nearest neighbors
        scored_neighbors = [neighbor for neighbor, _ in related_words if neighbor in word_final_scores]
        if len(scored_neighbors) >= 1:
            avg_score = sum(word_final_scores[neighbor] for neighbor in scored_neighbors) / len(scored_neighbors)
            word_final_scores[word] = avg_score  # Store the average score in the dictionary
        else:
            # If at least 1 out of 5 nearest neighbors have not been scored, recursively call the GloVe model
            for neighbor, _ in related_words:
                if neighbor not in word_final_scores and depth < 2:  # Check if the current depth is less than 2
                    api_calls_made += k_nearest_neighbors(neighbor, depth=depth + 1)  # Recursively call the function with incremented depth

    return api_calls_made  # Return the number of API calls made







import random
import csv

api_limit = 50
api_calls = 0

def random_word_k_nearest_neighbors():
    global api_calls, used_words
    while True:
        # Get the list of non-stop words not in used_words
        remaining_words = [word for word in word_scores.keys() if word not in stop_words and word not in used_words]

        # Break the loop if there are no remaining words
        if not remaining_words:
            break

        # Select a random non-stop word from the remaining_words list
        random_word = random.choice(remaining_words)
        used_words.add(random_word)
        print(random_word)

        # Check if the random_word is in the GloVe vocabulary
        if glove_model.has_index_for(random_word):
            # Call k_nearest_neighbors with the random word and use_api set to True if we haven't exceeded the API limit
            use_api = api_calls < api_limit
            api_calls_made = k_nearest_neighbors(random_word, use_api=use_api)
            api_calls += api_calls_made  # Add the API calls made within k_nearest_neighbors to the global API calls variable

            print(f"Random word: {random_word}, Total API calls: {api_calls}")
        else:
            print(f"{random_word} not in GloVe vocabulary")

        # Print the count of all words in the dictionary
        print(f"Total words in the dictionary: {len(word_final_scores)}")

        # Write the updated word_final_scores to a CSV file
        with open('word_scores.csv', 'w', newline='') as csvfile:   
            fieldnames = ['Word', 'Score']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for word, scores in word_scores.items():
                avg_score = sum(scores) / len(scores) if isinstance(scores, list) else scores
                writer.writerow({'Word': word, 'Score': avg_score})

        # Break the loop if you want to stop after a certain number of iterations
        # For example, you can stop when the used_words set reaches a certain size
        if len(used_words) >= 80000:
            break




### Call our function

In [28]:
random_word_k_nearest_neighbors()

shape
Random word: shape, Total API calls: 0
Total words in the dictionary: 2646
though
Random word: though, Total API calls: 0
Total words in the dictionary: 2646
future
Random word: future, Total API calls: 0
Total words in the dictionary: 2646
term
Random word: term, Total API calls: 0
Total words in the dictionary: 2646
juice
Random word: juice, Total API calls: 0
Total words in the dictionary: 2646
whatever
Random word: whatever, Total API calls: 0
Total words in the dictionary: 2646
slowest
Random word: slowest, Total API calls: 0
Total words in the dictionary: 2646
early
Random word: early, Total API calls: 0
Total words in the dictionary: 2646
den
Random word: den, Total API calls: 0
Total words in the dictionary: 2646
mammals
Random word: mammals, Total API calls: 0
Total words in the dictionary: 2646
map
Random word: map, Total API calls: 0
Total words in the dictionary: 2646
coming
Random word: coming, Total API calls: 0
Total words in the dictionary: 2646
vision
Random word

KeyboardInterrupt: 

## Write data to a csv file