# Personality, language, and the articulation of change

![Alt Text](changes.jpg)

## Our questions:

* Can we predict personality type from the language used in change narratives?
* Can we predict language used in change narratives from personality type?

## Step 1: Data cleaning and Big 5 extraction

In [None]:
import pandas as pd
# Read the questionnaire export, then discard every column that holds no
# values at all (how='all' keeps columns with at least one response).
data = (
    pd.read_excel("Personal Change Questionnaire(1-28).xlsx")
    .dropna(axis=1, how='all')
)

In [None]:
# Column labels are assigned purely by position, so the three lists below must
# stay in the same order as the columns of the spreadsheet.

# Columns that come before the questionnaire statements.
_leading_columns = [
    'ID', 'Start time', 'Completion time', 'Email',
    'nickname', 'gender', 'religion', 'freedom',
]

# The 44 personality statements, in questionnaire order.
_item_columns = [
    'Is talkative',
    'Tends to find fault with others',
    'Does a thorough job',
    'Is depressed, blue',
    'Is original, comes up with new ideas',
    'Is reserved',
    'Is helpful and unselfish with others',
    'Can be somewhat careless',
    'Is relaxed, can handle stress well',
    'Is curious about many different things',
    'Is full of energy',
    'Starts quarrels with others',
    'Is a reliable worker',
    'Can be tense',
    'Is ingenious, a deep thinker',
    'Generates a lot of enthusiasm',
    'Has a forgiving nature',
    'Tends to be disorganised',
    'Worries a lot',
    'Has an active imagination',
    'Tends to be quiet',
    'Is generally trusting',
    'Tends to be lazy',
    'Is emotionally stable, not easily upset',
    'Is inventive',
    'Has an assertive personality',
    'Can be cold and aloof',
    'Perseveres until the task is finished',
    'Can be moody',
    'Values artistic, aesthetic experiences',
    'Is sometimes shy, inhibited',
    'Is considerate and kind to almost everyone',
    'Does things efficiently',
    'Remains calm in tense situations',
    'Prefers work that is routine',
    'Is outgoing, sociable',
    'Is sometimes rude to others',
    'Makes plans and follows through with them',
    'Gets nervous easily',
    'Likes to reflect, play with ideas',
    'Has few artistic interests',
    'Likes to cooperate with others',
    'Is easily distracted',
    'Is sophisticated in art, music, or literature',
]

# Columns that follow the questionnaire statements.
_trailing_columns = ['narrative', 'age', 'intensity', 'impact', 'publish']

data.columns = _leading_columns + _item_columns + _trailing_columns

In [None]:
# Response labels, listed from strongest disagreement to strongest agreement.
values_t = [
    'Disagree strongly',
    'Disagree a little',
    'Neither agree nor disagree',
    'Agree a little',
    'Agree strongly',
]

# Numeric score for each label, in the same order.
values_n = list(range(1, 6))

# Label -> score lookup.
mapping = {label: score for label, score in zip(values_t, values_n)}

# Every cell that exactly equals one of the labels becomes its score;
# all other cells are left untouched.
raw = data.replace(mapping)

def rev(col, scale_min=1, scale_max=5):
    """Reverse-score a rating on a bounded scale.

    The lowest rating becomes the highest and vice versa; the midpoint is
    unchanged. With the default 1-5 scale this maps 1->5, 2->4, 3->3, 4->2,
    5->1.

    Args:
        col: A single rating or anything supporting arithmetic with numbers
            (e.g. a pandas Series of ratings).
        scale_min: Lowest value of the rating scale (default 1).
        scale_max: Highest value of the rating scale (default 5).

    Returns:
        The reversed rating(s), same shape as ``col``.
    """
    return (scale_max + scale_min) - col

# Statements that are reverse-scored before the trait averages are taken.
# Grouped here by the trait each one later contributes to.
reversals = [
    # extraversion
    'Is reserved',
    'Tends to be quiet',
    'Is sometimes shy, inhibited',
    # agreeableness
    'Tends to find fault with others',
    'Starts quarrels with others',
    'Can be cold and aloof',
    'Is sometimes rude to others',
    # conscientiousness
    'Can be somewhat careless',
    'Tends to be disorganised',
    'Tends to be lazy',
    'Is easily distracted',
    # neuroticism
    'Is relaxed, can handle stress well',
    'Is emotionally stable, not easily upset',
    'Remains calm in tense situations',
    # openness
    'Prefers work that is routine',
    'Has few artistic interests',
]

# Flip each listed column in place.
for item in reversals:
    raw[item] = rev(raw[item])
    
# The questionnaire statements in item order; the trailing comment on each
# line is the item number the column is renamed to below.
_numbered_items = [
    'Is talkative',                                    # 1
    'Tends to find fault with others',                 # 2
    'Does a thorough job',                             # 3
    'Is depressed, blue',                              # 4
    'Is original, comes up with new ideas',            # 5
    'Is reserved',                                     # 6
    'Is helpful and unselfish with others',            # 7
    'Can be somewhat careless',                        # 8
    'Is relaxed, can handle stress well',              # 9
    'Is curious about many different things',          # 10
    'Is full of energy',                               # 11
    'Starts quarrels with others',                     # 12
    'Is a reliable worker',                            # 13
    'Can be tense',                                    # 14
    'Is ingenious, a deep thinker',                    # 15
    'Generates a lot of enthusiasm',                   # 16
    'Has a forgiving nature',                          # 17
    'Tends to be disorganised',                        # 18
    'Worries a lot',                                   # 19
    'Has an active imagination',                       # 20
    'Tends to be quiet',                               # 21
    'Is generally trusting',                           # 22
    'Tends to be lazy',                                # 23
    'Is emotionally stable, not easily upset',         # 24
    'Is inventive',                                    # 25
    'Has an assertive personality',                    # 26
    'Can be cold and aloof',                           # 27
    'Perseveres until the task is finished',           # 28
    'Can be moody',                                    # 29
    'Values artistic, aesthetic experiences',          # 30
    'Is sometimes shy, inhibited',                     # 31
    'Is considerate and kind to almost everyone',      # 32
    'Does things efficiently',                         # 33
    'Remains calm in tense situations',                # 34
    'Prefers work that is routine',                    # 35
    'Is outgoing, sociable',                           # 36
    'Is sometimes rude to others',                     # 37
    'Makes plans and follows through with them',       # 38
    'Gets nervous easily',                             # 39
    'Likes to reflect, play with ideas',               # 40
    'Has few artistic interests',                      # 41
    'Likes to cooperate with others',                  # 42
    'Is easily distracted',                            # 43
    'Is sophisticated in art, music, or literature',   # 44
]

# Item-only view of the responses.
raw_numbered = raw[_numbered_items]

# Replace the long statement labels with their item numbers as strings
# ('1' .. '44') so items can be picked by number.
nums = [str(number) for number in range(1, 45)]
raw_numbered.columns = nums


# Item numbers that make up each trait scale (reverse-keyed items have already
# been flipped upstream). The item wording looks like the 44-item Big Five
# Inventory -- NOTE(review): confirm this key against the published BFI
# scoring instructions.
#
# Fix: item '25' used to be listed twice for openness, which double-weighted
# it in the openness mean. Each item now appears exactly once.
E = raw_numbered[['1', '6', '11', '16', '21', '26', '31', '36']]

A = raw_numbered[['2', '7', '12', '17', '22', '27', '32', '37', '42']]

C = raw_numbered[['3', '8', '13', '18', '23', '28', '33', '38', '43']]

N = raw_numbered[['4', '9', '14', '19', '24', '29', '34', '39']]

O = raw_numbered[['5', '10', '15', '20', '25', '30', '35', '40', '41', '44']]




# Trait score = row-wise mean of that trait's items. DataFrame.mean skips
# missing values by default, so a respondent with some unanswered items still
# gets a score from the items they did answer.
_trait_items = {
    'extraversion': E,
    'agreeableness': A,
    'conscientiousness': C,
    'neuroticism': N,
    'openness': O,
}
for _trait, _items in _trait_items.items():
    raw[_trait] = _items.mean(axis=1)

# Keep only the five trait scores. The explicit .copy() makes `raw` an
# independent frame, so the column assignments that follow do not raise
# pandas' SettingWithCopyWarning.
raw = raw[['extraversion', 'agreeableness', 'conscientiousness', 'openness',
           'neuroticism']].copy()

# Re-attach the respondent nickname (aligned on the row index).
raw['nickname'] = data['nickname']

In [None]:
# Bring the remaining questionnaire answers across from the original frame
# (aligned on the row index), in this column order.
for column in ('religion', 'freedom', 'narrative', 'intensity', 'age'):
    raw[column] = data[column]

# These columns come from the un-mapped frame, so convert any cell that is
# exactly one of the agreement labels into its numeric score.
raw = raw.replace(mapping)

[![YouTube Video](https://img.youtube.com/vi/p0FdAM9knDo/0.jpg)](https://www.youtube.com/watch?v=p0FdAM9knDo)

## Step 2: Consolidation of Big 5 scores into stability and plasticity

![Alt Text](superfactor.png)

In [None]:
from scipy.stats import zscore

big_five = ['extraversion', 'agreeableness', 'conscientiousness', 'openness', 'neuroticism']

# Work on an explicit copy: `five` gets new columns below, and writing to a
# bare selection of `raw` triggers pandas' SettingWithCopyWarning.
five = raw[big_five].copy()

# Standardise each trait across respondents (scipy's zscore, applied per
# column) so the traits are on a common scale before being summed.
df_z = five.apply(zscore)

# Calculate Stability and Plasticity
# stability  = z(agreeableness) + z(conscientiousness) - z(neuroticism)
five['stability'] = (
    df_z['agreeableness']
    + df_z['conscientiousness']
    - df_z['neuroticism']
)

# plasticity = z(extraversion) + z(openness)
five['plasticity'] = df_z['extraversion'] + df_z['openness']

raw['plasticity'] = five['plasticity']
raw['stability'] = five['stability']

[![YouTube Video](https://img.youtube.com/vi/sNgiN8hXK6M/0.jpg)](https://www.youtube.com/watch?v=sNgiN8hXK6M)

## Step 3: NLP analysis

In [None]:
import plotly.graph_objects as go
!pip install pingouin
!python -m spacy download en_core_web_md
import pingouin as pg
import plotly.express as px
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import spacy
import re
import pingouin
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Load the spaCy pipeline once. It used to be loaded twice in this cell, with
# the second load simply replacing the first.
nlp = spacy.load("en_core_web_md")

# Mark a bare newline as a stop word so the stop-word filter in process()
# drops it.
nlp.vocab["\n"].is_stop = True

# Maximum text length the pipeline will accept.
nlp.max_length = 4353682

def process(text):
    """Return the lower-cased lemmas of the content words in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only if it passes every filter below; the result preserves token
    order and may contain repeats.

    Args:
        text: The string to parse.

    Returns:
        list[str]: Lower-cased lemma of each surviving token.
    """
    full_vocab = []
    for token in nlp(text):
        # Stop words and punctuation carry no content.
        if token.is_stop or token.is_punct:
            continue
        # Whitespace-only tokens.
        if token.text.strip() == '':
            continue
        # Tokens containing non-ASCII characters.
        if not token.is_ascii:
            continue
        # Tokens that do not start with a letter.
        if not re.match('[a-zA-Z]', token.text):
            continue
        # Tokens made up only of line breaks.
        if re.match('^[\n]+$', token.text):
            continue
        # URLs, leftover HTML entities and number-like tokens.
        if token.like_url or '&nbsp' in token.text or token.like_num:
            continue
        full_vocab.append(token.lemma_.lower())
    return full_vocab

In [None]:
# Word norm data
# Lookup table of word norms. The functions below test membership with
# `norms.index` and fetch rows with `norms.loc[word]`, so the table is
# presumably indexed by word with one column per norm -- TODO confirm against
# the pickle's contents.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.

norms = pd.read_pickle("all_norm_estimates.pkl")

In [None]:
# Word norm functions

def word_norms(text):
    """Look up the norm values of every known word in ``text``.

    The text is lemmatised with ``process`` and each lemma found in the
    index of the module-level ``norms`` table contributes one row, in order
    of appearance (repeated words give repeated rows). Lemmas without an
    entry in ``norms`` are skipped.

    The previous version collected rows into a list it never used and then
    handed the whole ``norms`` table to the DataFrame constructor, relying on
    implicit re-indexing. The rows are now selected directly.

    Args:
        text: The string to analyse.

    Returns:
        pandas.DataFrame: One row per matched lemma (indexed by the lemma),
        with the columns of ``norms``.
    """
    words = [lemma for lemma in process(text) if lemma in norms.index]
    return norms.loc[words]

def word_norms_mean(text):
    """Average the norm values over every known word in ``text``.

    The text is lemmatised with ``process``; each lemma present in the index
    of the module-level ``norms`` table contributes its row (once per
    occurrence), and the rows are averaged column by column.

    Args:
        text: The string to analyse.

    Returns:
        pandas.Series: Column-wise mean of the matched rows.
    """
    matched_rows = [
        norms.loc[lemma]
        for lemma in process(text)
        if lemma in norms.index
    ]
    return pd.DataFrame(matched_rows).mean()

In [None]:
# Extract word norms

# One mean-norm profile per narrative, in row order.
tales = list(raw['narrative'])
norm = list(map(word_norms_mean, tales))

# Stack the profiles into a frame (one row per narrative) and attach them to
# the respondent data side by side; rows are matched on the index.
norms_all = pd.DataFrame(norm)
raw = pd.concat([raw, norms_all], axis=1)

In [None]:
# Get spaCy embeddings for each narrative

# Parse every narrative and keep its document-level vector.
vectors = [doc.vector for doc in map(nlp, raw['narrative'])]

# One row per narrative, one column per embedding dimension.
vecs_df = pd.DataFrame(vectors)

In [None]:
# Identify the optimal number of components to use

import matplotlib.pyplot as plt
import numpy as np

# Fit PCA with as many components as the data allows. PCA cannot return more
# components than min(n_samples, n_features); deriving the number from the
# data (instead of hard-coding 28) keeps this cell working when the number of
# narratives changes.
max_components = min(vecs_df.shape)
pca = PCA(n_components=max_components)
X_pca = pca.fit_transform(vecs_df)

# Cumulative share of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot against k = 1..max_components so the x-axis really is the number of
# components (plotting the array alone would start the axis at 0).
component_counts = np.arange(1, len(cumulative_variance) + 1)

# Plot cumulative explained variance
plt.figure(figsize=(8,5))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Explained Variance vs. Number of Components')
plt.grid(True)
plt.show()

In [None]:
# Add the identified components to the data

# Project the embeddings onto three principal components.
pca = PCA(n_components=3)
comps = pca.fit_transform(vecs_df)

# Name the component columns PC1, PC2, ... and attach them to the respondent
# data side by side (rows are matched on the index).
pc_labels = ['PC' + str(number) for number in range(1, comps.shape[1] + 1)]
pc_df = pd.DataFrame(data=comps, columns=pc_labels)
raw = pd.concat([raw, pc_df], axis=1)

In [None]:
# 3-D scatter of the narratives in principal-component space; hovering over a
# point shows the respondent's nickname.
fig = px.scatter_3d(
    raw,
    x="PC1",
    y="PC2",
    z="PC3",
    hover_data=['nickname'],
    color_discrete_sequence=['red'],  # every marker red
)

# Dark theme: one background colour everywhere, light grey gridlines.
background = "#343434"
axis_style = dict(
    showgrid=True,
    gridcolor='rgba(150, 150, 150, 0.5)',
    backgroundcolor=background,
)

fig.update_layout(
    title="Change narrratives Immersion Week 25 T3",
    paper_bgcolor=background,  # area outside the plot
    plot_bgcolor=background,   # 2-D plot area; the 3-D scene is styled below
    font=dict(color='white'),
    scene=dict(
        xaxis=axis_style,
        yaxis=axis_style,
        zaxis=axis_style,
        bgcolor=background,    # background of the 3-D scene itself
    ),
)

# Marker appearance (red is set explicitly here as well).
fig.update_traces(marker=dict(size=5, color='red', opacity=0.8))

fig.show()

[![YouTube Video](https://img.youtube.com/vi/IJSv6JXKS_I/0.jpg)](https://www.youtube.com/watch?v=IJSv6JXKS_I)

## Step 4: Some data science

### OLS regression

In [None]:
# Word-norm columns used as predictors.
ols_predictors = [
    'valence', 'arousal', 'dominance',
    'auditory', 'gustatory', 'interoceptive', 'olfactory', 'visual',
    'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
    'concreteness', 'imageability', 'semantic_size', 'haptic',
]

# Ordinary least squares: plasticity regressed on the word-norm profile.
lm = pg.linear_regression(raw[ols_predictors], raw['plasticity'])
lm

In [None]:
# Pairwise correlations between the word-norm columns, drawn as a heatmap.
heatmap_columns = [
    'valence', 'arousal', 'dominance',
    'auditory', 'gustatory', 'interoceptive', 'olfactory', 'visual',
    'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
    'concreteness', 'imageability', 'semantic_size', 'haptic',
]
sns.heatmap(raw[heatmap_columns].corr())

### Ridge regression

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score


# Dependent variable: change this name to model a different column of `raw`.
variable = 'stability'

# Predictors: the word-norm profile of each narrative.
ridge_predictors = [
    'valence', 'arousal', 'dominance',
    'auditory', 'gustatory', 'interoceptive', 'olfactory', 'visual',
    'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
    'concreteness', 'imageability', 'semantic_size', 'haptic',
]
X = raw[ridge_predictors]

y = raw[variable]




In [None]:
# Candidate regularisation strengths: 100 values, log-spaced from 1e-3 to 1e3.
alpha_grid = np.logspace(-3, 3, 100)

# RidgeCV chooses alpha from the grid by its own 5-fold cross-validation.
ridge = RidgeCV(alphas=alpha_grid, cv=5)

# Standardise the predictors before the ridge step, since the penalty is
# sensitive to the scale of each predictor.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', ridge),
])

# Score the whole pipeline with 5-fold cross-validated R².
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

print(f"Mean CV R²: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


In [None]:
# Refit the pipeline on all of the data.
pipeline.fit(X, y)

# Pull the fitted ridge step back out of the pipeline.
ridge_model = pipeline.named_steps['ridge']

# Pair each predictor with its coefficient, largest magnitude first.
coef_table = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': ridge_model.coef_,
})
coef_df = coef_table.sort_values(by='Coefficient', key=abs, ascending=False)

print(coef_df)

# Horizontal bar chart of the coefficients; the y-axis is inverted so the
# first (largest-magnitude) row of the table is drawn at the top.
coef_fig, coef_ax = plt.subplots(figsize=(10,6))
coef_ax.barh(coef_df['Feature'], coef_df['Coefficient'])
coef_ax.set_xlabel('Ridge Coefficient')
coef_ax.set_title('Feature importance for '+variable+' (Ridge regression)')
coef_ax.invert_yaxis()
coef_ax.grid(True)
plt.show()


[![YouTube Video](https://img.youtube.com/vi/n4RjJKxsamQ/0.jpg)](https://www.youtube.com/watch?v=n4RjJKxsamQ)