# Import the relevant libraries

In [1]:
# --- Imports ---------------------------------------------------------------
import pandas as pd  # data science library
import seaborn as sns  # plotting library
import plotly.express as px  # plotting library for fancy interactive plots
#!pip install jupyterlab "ipywidgets>=7.5"
import matplotlib.pyplot as plt  # yet another plotting library, python's main one
from nltk.corpus import wordnet as wn  # a corpus of language from WordNet, a large linguistic resource
from nltk.stem.wordnet import WordNetLemmatizer  # a lemmatizer reduces a word to its root form
from nltk.tokenize import word_tokenize  # a tokenizer breaks text into chunks, usually words
from sklearn.preprocessing import MinMaxScaler  # rescales a column of values to between 0 and 1

# --- One-off setup -----------------------------------------------------------
sns.set()  # apply seaborn's plot style
lemmatizer = WordNetLemmatizer()  # single lemmatizer instance shared by the word-norm helpers

# Import the word norm data

In [2]:
# Both spreadsheets are read with their first column as the index; the
# word-norm helpers later look lemmas up in that index.
vad = pd.read_excel('vad.xlsx', index_col=0)  # VAD norms

# Sensorimotor norms: keep only the perceptual and action-effector columns.
SENSORIMOTOR_COLUMNS = [
    'auditory', 'gustatory', 'haptic', 'interoceptive', 'olfactory', 'visual',
    'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
]
sm = pd.read_excel('sensorimotor.xlsx', index_col=0)
sm = sm[SENSORIMOTOR_COLUMNS]

# Rescale every sensorimotor column to the 0-1 range.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(sm)

# fit_transform returns a bare array, so re-attach the original row and column labels.
scaled_df = pd.DataFrame(scaled_data, index=sm.index, columns=sm.columns)

# Inner join on the index: only words present in BOTH tables end up in all_norms.
all_norms = vad.merge(scaled_df, how='inner', left_index=True, right_index=True)

# Create a word-norm DataFrame from raw text that preserves the values for each word, and one that gives the average over all the words

In [4]:

def word_norms(text):
    """Return the norms of every word in ``text`` that has an entry in ``all_norms``.

    The text is tokenized, each token is lower-cased and lemmatized, and every
    lemma found in ``all_norms.index`` contributes one row to the result.
    Lemmas without an entry are skipped. A word that occurs several times in
    the text yields one row per occurrence, in text order.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per matched lemma (indexed by the lemma) with the columns of
        ``all_norms``. If no lemma matches, the frame has no rows but still
        carries those column labels.
    """
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(text)]

    # Keep only the lemmas that have norms.
    words = [lemma for lemma in lemmas if lemma in all_norms.index]

    # Read the rows from all_norms -- the same table the membership test uses --
    # so the result holds the merged, 0-1 scaled norms. The previous version
    # read from `sm`, i.e. the raw, unscaled sensorimotor values.
    rows = [all_norms.loc[word] for word in words]

    # Passing `columns` keeps the column labels even when nothing matched.
    return pd.DataFrame(rows, index=words, columns=all_norms.columns)

def word_norms_mean(text):
    """Return the average norms over all words in ``text`` that have an entry in ``all_norms``.

    The text is tokenized, each token is lower-cased and lemmatized, and the
    rows of ``all_norms`` belonging to the matched lemmas are averaged column
    by column. Lemmas without an entry are skipped. A word that occurs several
    times in the text is counted once per occurrence.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    pandas.Series
        The mean of each ``all_norms`` column over the matched lemmas. If no
        lemma matches, every entry is NaN.
    """
    lemmas = [lemmatizer.lemmatize(token.lower()) for token in word_tokenize(text)]

    # Keep only the lemmas that have norms.
    words = [lemma for lemma in lemmas if lemma in all_norms.index]

    # Select from all_norms -- the same table the membership test uses -- so the
    # average is taken over the merged, 0-1 scaled norms. The previous version
    # averaged rows of `sm`, i.e. the raw, unscaled sensorimotor values.
    return all_norms.loc[words].mean()


# Example 1: Shakespeare's Sonnets

In [5]:
# Read the whole sonnets file into one string.
# The encoding is given explicitly: the file starts with a UTF-8 byte-order mark
# (it is stripped as '\ufeff' during cleaning), and without `encoding` open()
# falls back to the platform's locale encoding, which on a non-UTF-8 locale
# would decode the mark and any non-ASCII text into the wrong characters.
with open("sonnets.txt", "r", encoding="utf-8") as f:
    son = f.read()

In [6]:
# Three consecutive newlines (i.e. two blank lines) mark the boundary between sonnets.
SONNET_SEPARATOR = "\n\n\n"
sons = son.split(SONNET_SEPARATOR)

In [7]:
# Tidy each sonnet: drop any byte-order mark, join its lines into a single
# line of text, and trim surrounding whitespace.
clean = [
    sonnet.replace('\ufeff', '').replace('\n', ' ').strip()
    for sonnet in sons
]
