# Capstone Discovery Problem Space: Language and cognition

In [None]:
!pip install pingouin
from PIL import Image
import plotly.graph_objects as go
import requests
import pingouin as pg
import plotly.express as px
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import spacy
import re
import pingouin
nlp = spacy.load('en_core_web_sm')
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# The two colours the style overrides are built from
_BACKGROUND = "#343434"
_FOREGROUND = "white"

# rcParams overrides: dark canvas, white axes/text/lines, no gridlines
custom_style = {
    "axes.facecolor": _BACKGROUND,
    "axes.edgecolor": _FOREGROUND,
    "axes.labelcolor": _FOREGROUND,
    "xtick.color": _FOREGROUND,
    "ytick.color": _FOREGROUND,
    "figure.facecolor": _BACKGROUND,
    "grid.color": "none",  # hides the gridlines
    "text.color": _FOREGROUND,
    "lines.color": _FOREGROUND,  # default line colour
    "lines.markeredgecolor": _FOREGROUND,  # default marker edge colour
    "lines.markerfacecolor": _FOREGROUND,  # default marker face colour
}

# Push the overrides into matplotlib's global settings
plt.rcParams.update(custom_style)

# English spaCy pipeline that process() parses text with
nlp = spacy.load("en_core_web_sm")
nlp.vocab["\n"].is_stop = True  # flag a bare newline as a stop word
nlp.max_length = 4353682  # accept texts of up to 4,353,682 characters

def process(text):
    """Return the lower-cased lemmas of the content tokens in ``text``.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``. A token
    contributes its lemma only if it survives every filter below; surviving
    tokens keep their original order and repeated lemmas are not collapsed.
    """
    lemmas = []
    for token in nlp(text):
        surface = token.text
        # Stop words and punctuation
        if token.is_stop or token.is_punct:
            continue
        # Whitespace-only tokens
        if surface.strip() == '':
            continue
        # Tokens containing non-ASCII characters
        if not token.is_ascii:
            continue
        # Tokens that do not begin with an ASCII letter
        if not re.match('[a-zA-Z]', surface):
            continue
        # Runs of line breaks
        if re.match('^[\n]+$', surface):
            continue
        # URLs and number-like tokens
        if token.like_url or token.like_num:
            continue
        # Leftover HTML entities
        if '&nbsp' in surface:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

## Let's load some word norm data; these are numerical ratings of words along different cognitive and emotional dimensions

In [7]:
# Download the pickled word-norm table; later cells look words up through its index.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a source you trust (here, a GitHub raw URL).
norms = pd.read_pickle("https://raw.githubusercontent.com/texturejc/IGEL/main/public_norms.pkl")

# Note that this is unpublished data that I've compiled from various published sources. It's been rescaled
# between 0 and 1 for all values, and some values are extrapolated using machine learning. If you use it, please
# note these caveats!

In [None]:
# Draw 20 random rows from the norms table (no seed is set, so each run differs)
norms_sample = norms.sample(n=20)

# One white bar per sampled row; bar height is that row's concreteness score
sns.barplot(
    data=norms_sample,
    x=norms_sample.index,
    y='concreteness',
    color='white',
)
plt.xticks(rotation=85)  # rotate the x tick labels by 85 degrees
plt.show()




In [None]:
# Kernel-density plot of the 'gustatory' column over the whole norms table
sns.displot(norms['gustatory'], color = 'white', kind = 'kde')

In [None]:
# Every column of the norms table becomes one spoke of the radar chart
categories = norms.columns

# The two words being compared
word_1 = "concept"
word_2 = "cat"

# Each word's row of ratings
word_1_values = norms.loc[word_1]
word_2_values = norms.loc[word_2]

# Radar chart: one filled polygon per word
fig = go.Figure()
for label, scores, colour in (
    (word_1, word_1_values, 'blue'),
    (word_2, word_2_values, 'red'),
):
    fig.add_trace(go.Scatterpolar(
        r=scores,
        theta=categories,
        fill='toself',
        name=label,
        line=dict(color=colour),
    ))

# Dark background, white text, fixed 800x600 canvas, radial axis pinned to 0-1
dark = '#343434'
fig.update_layout(
    polar=dict(
        bgcolor=dark,
        radialaxis=dict(visible=True, range=[0, 1], showticklabels=True, ticks=''),
        angularaxis=dict(showticklabels=True, ticks=''),
    ),
    paper_bgcolor=dark,
    plot_bgcolor=dark,
    showlegend=True,
    legend=dict(font=dict(size=18, color='white')),
    font=dict(size=16, color='white'),
    width=800,
    height=600,
)

# Render the figure
fig.show()


## But what use is this data? Let's take Shakespeare's Sonnets as a starting point

### Load the sonnets from a text file and clean the text. The variable `clean` contains the individual sonnets

In [8]:
# Raw-text location of the sonnets
sonnets = "https://raw.githubusercontent.com/texturejc/IGEL/main/sonnets.txt"

# Download the file. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() fails loudly on an HTTP error instead of
# letting an error page be split and scored as if it were sonnets.
response = requests.get(sonnets, timeout=30)
response.raise_for_status()
texts = response.text

# Chunks in the file are separated by two blank lines (CRLF line endings)
sons = texts.split("\r\n\r\n\r\n")

# Per chunk: drop the byte-order mark, turn remaining line breaks into spaces,
# trim the ends and lower-case. Empty chunks are deliberately kept so that
# positions in `clean` still line up with positions in `sons`.
clean = [
    chunk.replace('\ufeff', '').replace('\n', ' ').replace('\r', ' ').strip().lower()
    for chunk in sons
]


In [None]:
# Show the first raw chunk produced by the split, before any cleaning
print(sons[0])

## Create two Python functions that turn the texts into lemmas and score them for our variables of interest

In [10]:
def word_norms(text):
    """Return the norm ratings of every rated lemma found in ``text``.

    ``text`` is lemmatised with ``process``; lemmas that are absent from the
    index of the module-level ``norms`` table are dropped.

    Returns a DataFrame with one row per rated lemma occurrence (in text
    order, repeats included) and the same columns as ``norms``. If no lemma
    is rated, the frame is empty but keeps those columns.
    """
    # Filtering on index membership first keeps .loc from raising KeyError
    # on lemmas that have no rating.
    rated = [lemma for lemma in process(text) if lemma in norms.index]
    return norms.loc[rated]

def word_norms_mean(text):
    """Return the mean of each norm dimension over the rated lemmas in ``text``.

    ``text`` is lemmatised with ``process``; lemmas that are absent from the
    index of the module-level ``norms`` table are ignored. A lemma that occurs
    several times is counted once per occurrence.

    Returns a Series indexed by the columns of ``norms``. When no lemma in the
    text is rated, every entry is NaN, so the result has the same labels
    whether or not anything matched.
    """
    # Filtering on index membership first keeps .loc from raising KeyError
    # on lemmas that have no rating.
    rated = [lemma for lemma in process(text) if lemma in norms.index]
    return norms.loc[rated].mean()

In [11]:
# Example: mean norm profile of the cleaned chunk at index 1
word_norms_mean(clean[1])

valence          0.606014
arousal          0.391034
dominance        0.592439
auditory         0.329886
gustatory        0.083915
interoceptive    0.276977
olfactory        0.107362
visual           0.601539
foot_leg         0.191936
hand_arm         0.321097
head             0.496878
mouth            0.299357
torso            0.218656
concreteness     0.447777
imageability     0.499000
semantic_size    0.531463
haptic           0.276342
dtype: float32

### Let's get all our sonnets into a dataframe so we can work with them quantitatively

In [12]:
# One mean-norm Series per cleaned chunk, in the order the chunks appear
over_time = [word_norms_mean(sonnet) for sonnet in clean]

# Stack the Series into a table: one row per chunk, one column per norm
over_time_df = pd.DataFrame(over_time)

# Readable row labels: index 0 -> "Sonnet 1", index 1 -> "Sonnet 2", ...
name = [f"Sonnet {position + 1}" for position in over_time_df.index]
over_time_df['name'] = name

In [None]:
# Display the per-sonnet table of mean norms
over_time_df

In [None]:
# Mean valence of each row against its position, with an OLS trend line
fig = px.scatter(
    over_time_df,
    x=over_time_df.index,
    y="valence",
    hover_data=['name'],
    trendline="ols",
)

# Dark canvas with white text; grid shown on both axes
dark = "#343434"
fig.update_layout(
    title="Trend over time in Shakespeare's Sonnets",
    paper_bgcolor=dark,
    plot_bgcolor=dark,
    font=dict(color='white'),
    xaxis=dict(showgrid=True, title="Sonnet"),
    yaxis=dict(showgrid=True),
)

# Recolour only traces drawn in 'lines' mode, leaving the scatter markers alone
fig.update_traces(line=dict(color='red'), selector=dict(mode='lines'))
fig.show()

In [None]:
# Regress mean valence (response) on row position (predictor) with pingouin,
# then display the result pingouin returns.
lm = pg.linear_regression(over_time_df.index, over_time_df['valence'])
lm

## So what else can we do with this data? Let's take a look at a corpus of Twitter data

In [51]:
# Download the pickled Twitter table.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a source you trust (here, a GitHub raw URL).
twitter = pd.read_pickle('https://raw.githubusercontent.com/texturejc/capstone_discovery_language/main/twitter_gender.pkl')

In [53]:
# List the columns available in the Twitter table
twitter.columns

Index(['gender', 'text', 'retweet_count', 'valence', 'arousal', 'dominance',
       'auditory', 'gustatory', 'interoceptive', 'olfactory', 'visual',
       'foot_leg', 'hand_arm', 'head', 'mouth', 'torso', 'concreteness',
       'imageability', 'semantic_size', 'haptic', 'processed_text'],
      dtype='object')

In [None]:
# Target terms to look for in each tweet's processed text
word = ['cat', 'dog']

# Boolean mask: True where processed_text contains at least one target term
# (`in` is applied to whatever object the column stores for that row)
mentions_target = twitter['processed_text'].apply(
    lambda text: any(target in text for target in word)
)
filtered_df = twitter[mentions_target]
