<a href="https://colab.research.google.com/github/ved-phadke/math-m148-final-project/blob/main/text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Pre-processing

This notebook is to extract text features from the dataframe of transcripts.

## Imports

In [None]:
import pandas as pd
import nltk
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util
import spacy
from textblob import TextBlob
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
import unicodedata
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
try:
    nltk.data.find('vader_lexicon')
except LookupError:
    nltk.download('vader_lexicon')
from sklearn.decomposition import NMF



## Reading in Data

In [None]:
df = pd.read_csv("/content/stim_df.csv", encoding = 'latin-1')

## TF-IDF Work

In [None]:
# Load the English language model for spaCy
nlp = spacy.load("en_core_web_sm")

# Convert transcripts and descriptions to string format and clean encoding issues
df['transcript'] = df['transcript'].astype(str).apply(lambda x: x.encode('latin1').decode('utf-8', 'ignore'))
df['Description '] = df['Description '].astype(str).apply(lambda x: x.encode('latin1').decode('utf-8', 'ignore'))

# Remove unintended newlines (except for those following a period)
df['transcript_clean'] = df['transcript'].astype(str).apply(lambda x: re.sub(r'(?<!\.)\n(?!\.)', ' ', x))

# Remove any non-alphanumeric characters except spaces, punctuation (. , ! ?)
df['transcript_clean'] = df['transcript_clean'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s.,!?]', '', str(x)))

# Compute word count by splitting on whitespace
df['word_count'] = df['transcript_clean'].apply(lambda x: len(str(x).split()))

# Compute sentence count by splitting on periods
df['sentence_count'] = df['transcript_clean'].apply(lambda x: len(str(x).split('.')))

# Compute sentiment scores
df['tb_sentiment'] = df['transcript_clean'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)  # TextBlob Sentiment
df['vader_sentiment'] = df['transcript_clean'].apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(str(x))['compound'])  # Vader Sentiment

# Perform Part-of-Speech (POS) tagging using spaCy
df['pos_nouns'] = df['transcript_clean'].apply(lambda x: len([token.text for token in nlp(str(x)) if token.pos_ == "NOUN"]))  # Count Nouns
df['pos_adjectives'] = df['transcript_clean'].apply(lambda x: len([token.text for token in nlp(str(x)) if token.pos_ == "ADJ"]))  # Count Adjectives


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Extend default stop words with additional words if needed
my_stop_words = text.ENGLISH_STOP_WORDS.union()  # You can add custom stopwords here if necessary

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=500,  # Limit the number of features (top 500 words)
    stop_words='english'  # Remove common English stopwords
)

# Apply TF-IDF transformation on the cleaned transcript column (handling missing values)
tfidf_matrix = vectorizer.fit_transform(df['transcript_clean'].fillna(''))

# Convert the sparse TF-IDF matrix into a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Concatenate the TF-IDF features with the original DataFrame
df_tfidf = pd.concat([df, tfidf_df], axis=1)


## NMF Topic Modeling

In [None]:
def nmf_topic_model(df_tfidf, num_topics=5):
    """
    Applies Non-negative Matrix Factorization (NMF) to topic model the tf-idf matrix.

    Args:
        df_tfidf: DataFrame containing the TF-IDF matrix.
        num_topics: The desired number of topics.

    Returns:
        A tuple containing:
            - W: The document-topic matrix.
            - H: The topic-term matrix.
            - nmf_model: The fitted NMF model
    """
    tfidf_cols = [col for col in df_tfidf.columns if col not in df.columns]
    nmf_model = NMF(n_components=num_topics, random_state=42)
    W = nmf_model.fit_transform(df_tfidf[tfidf_cols])  # Use only the TF-IDF columns
    H = nmf_model.components_
    return W, H, nmf_model

# Apply the function
W, H, nmf = nmf_topic_model(df_tfidf)

# Convert to dfs
w_df = pd.DataFrame(W, columns=[f'topic_{i+1}' for i in range(W.shape[1])])
h_df = pd.DataFrame(H, columns=[col for col in df_tfidf.columns if col not in df.columns]).transpose()


In [None]:
# Small function to try and see what text values are grouped together in a topic
def get_top_indices(df):
    return df.apply(lambda x: x.sort_values(ascending=False).head(10).index.tolist())

top_indices = get_top_indices(h_df)
top_indices


Unnamed: 0,0,1,2,3,4
0,,im,men,come,flower
1,verse,know,power,tooth,beauty
2,tree,life,hate,sweaty,adds
3,shore,just,let,itll,interesting
4,hallelujah,going,dont,madman,sees
5,rainbows,dont,fight,todd,artist
6,beings,youre,world,anderson,insects
7,human,day,people,dont,agree
8,did,feel,want,think,dont
9,stranger,like,machine,mr,believe


In [None]:
w_df

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5
0,0.5885761,0.0,0.0,0.0,0.0
1,0.5885761,0.0,0.0,0.0,0.0
2,0.0,0.372883,0.0,0.0,0.0
3,0.0,0.33759,0.037944,0.0,0.018957
4,0.0,0.159285,0.054683,0.0,0.0
5,6.0964e-25,0.056484,0.0,1.7e-05,0.002581
6,0.0,0.320493,0.0,0.0,0.095922
7,0.5885761,0.0,0.0,0.0,0.0
8,7.165726e-26,0.1171,0.001299,0.093933,0.004911
9,0.5885761,0.0,0.0,0.0,0.0


In [None]:
df = pd.concat([df, w_df], axis=1)

## More TF-IDF Manipulation

In [None]:
# Tokenize transcript into sentences, ensuring at least one sentence per entry
df['transcript_sentences'] = df['transcript_clean'].apply(
    lambda x: sent_tokenize(x) if len(sent_tokenize(x)) > 1 else [x]
)


def lacks_punctuation(text):
    """
    Determines if a text lacks punctuation based on the punctuation-to-word ratio.

    Parameters:
    text (str): The input text to evaluate.

    Returns:
    bool: True if punctuation density is low (below 2%), otherwise False.
    """
    return sum(1 for char in text if char in ".!?") / max(1, len(text.split())) < 0.02


# Apply the function to identify texts with very few punctuation marks
df['lacks_punctuation'] = df['transcript_clean'].apply(lacks_punctuation)


def split_into_phrases(text):
    """
    Splits text into smaller phrases using common discourse markers
    or chunks of 10 words as a fallback.

    Parameters:
    text (str): The input text to split.

    Returns:
    list: A list of smaller text chunks.
    """
    # Define common discourse markers for natural breaks in speech/writing
    markers = r'\b(and|but|so|then|therefore|however|because|although|if|when|while)\b'

    # Split the text using the discourse markers
    split_text = re.split(markers, text, flags=re.IGNORECASE)

    # Further split large sections into chunks of 10 words if necessary
    phrase_chunks = []
    for phrase in split_text:
        words = phrase.split()
        phrase_chunks.extend([' '.join(words[i:i+10]) for i in range(0, len(words), 10)])

    return phrase_chunks if phrase_chunks else [text]  # Ensure at least one chunk


# Adjust sentence segmentation: Use sentence tokenization unless the text lacks punctuation
df['transcript_sentences'] = df.apply(
    lambda row: sent_tokenize(row['transcript_clean'])
    if not row['lacks_punctuation']
    else split_into_phrases(row['transcript_clean']),
    axis=1
)


## Read in Chills_DB

In [None]:
chills_db = pd.read_csv("/content/ChillsDB 2 - ChillsDB 2.csv")
chills_db['Chills?'].replace({'Yes': 1, 'No': 0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  chills_db['Chills?'].replace({'Yes': 1, 'No': 0}, inplace=True)
  chills_db['Chills?'].replace({'Yes': 1, 'No': 0}, inplace=True)


In [None]:
def normalize_stimulus_name(name):
    """
    Normalize a stimulus name by:
    - Stripping whitespace and lowercasing,
    - Normalizing Unicode characters,
    - Removing punctuation (keeping only alphanumeric characters and spaces),
    - Collapsing multiple spaces.
    """
    name = str(name).strip().lower()
    name = unicodedata.normalize('NFKD', name)
    name = re.sub(r'[^a-z0-9\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    return name

def aggregate_chills_response(val):
    """
    Convert a concatenated string of "Yes" and "No" responses into a numeric value:
    The proportion of "Yes" responses.
    """
    s = str(val).lower()
    yes_count = len(re.findall(r'yes', s))
    no_count = len(re.findall(r'no', s))
    total = yes_count + no_count
    return yes_count / total if total > 0 else None



def process_and_merge_chills_data(chills_db, df):
    """
    Reads the ChillsDB dataset with the correct encoding, aggregates chills data,
    and merges it with the main dataframe after applying a manual mapping to resolve
    stimulus name mismatches.

    Parameters:
    - chills_file_path (str): Path to the ChillsDB dataset CSV file.
    - df (pd.DataFrame): The main dataset containing stimulus information.

    Returns:
    - pd.DataFrame: The merged dataframe with average chills and chills intensity.
    """
    # Copy the main dataframe to avoid modifying the original
    main_df = df.copy()

    # Step 1: Read the ChillsDB dataset with the correct encoding
    df_chills_responses = chills_db
    #df_chills_responses['Chills?'].replace({'Yes': 1, 'No': 0}, inplace=True)


    # Step 2: Ensure column names are consistent
    df_chills_responses.rename(columns={'Stimulus': 'Stimulus name',
                                         'Chills intensity': 'Chills Intensity'}, inplace=True)

    # if 'Chills?' in df_chills_responses.columns:
    #     df_chills_responses['Chills?'] = df_chills_responses['Chills?'].apply(aggregate_chills_response)

    # Step 3: Normalize 'Stimulus name' columns in both dataframes
    df_chills_responses['Stimulus name'] = df_chills_responses['Stimulus name'].apply(normalize_stimulus_name)
    main_df['Stimulus name'] = main_df['Stimulus name'].apply(normalize_stimulus_name)

    # Step 4: Aggregate responses: Compute the average chills and chills intensity per stimulus
    df_chills_avg = df_chills_responses.groupby('Stimulus name', as_index=False).agg({
        'Chills?': 'mean',
        '#Chills': 'mean',
        'Chills Intensity': 'mean'
    })

    # Step 5: Define a manual mapping dictionary based on your provided lists.
    # The keys are normalized stimulus names from your main dataframe and the values are
    # the corresponding normalized names from the chills_db.
    manual_mapping = {
        normalize_stimulus_name("Misere Mei, Deus (Audio)"): normalize_stimulus_name("Miserere Me (Audio)"),
        normalize_stimulus_name("3rd Grade Dropout Speech (Audio)"): normalize_stimulus_name("3rd Grade Drop Out (Audio)"),
        normalize_stimulus_name("Unbroken - Motivation (Audio)"): normalize_stimulus_name("Unbroken (Audio)"),
        normalize_stimulus_name("We Think Too Much and Feel Too Little (Audio)"): normalize_stimulus_name("Think Too Much Feel Too Little"),
        normalize_stimulus_name("The Great Dictator (Audio)"): normalize_stimulus_name("Great Dictator (Audio)"),
        normalize_stimulus_name("The Great Dictator"): normalize_stimulus_name("Great Dictator"),
        normalize_stimulus_name("We Think Too Much and Feel Too Little"): normalize_stimulus_name("Think Too Much Feel Too Little"),
        normalize_stimulus_name("Aramaic Choir (Audio)"): normalize_stimulus_name("Aramaic Choir"),
        normalize_stimulus_name("Dead Poet's Society - YAWP Scene (Audio)"): normalize_stimulus_name("Dead Poets (Audio)"),
        normalize_stimulus_name("The Feynmann Series - Beauty (Audio)"): normalize_stimulus_name("Feynman (Audio)"),
        normalize_stimulus_name("The Feynmann Series - Beauty"): normalize_stimulus_name("Feynman (Audio)"),
        normalize_stimulus_name("Be Kind (Everything Everywhere All At Once)"): normalize_stimulus_name("Be Kind"),
        normalize_stimulus_name("Interstellar with Hans Zimmer"): normalize_stimulus_name("Interstellar"),
        normalize_stimulus_name("Amelie - Helping a Blind Man"): normalize_stimulus_name("Amelie"),
        normalize_stimulus_name("Muhammed Ali"): normalize_stimulus_name("Muhammad Ali"),
        normalize_stimulus_name("A Perfect Planet"): normalize_stimulus_name("Perfect Planet"),
        normalize_stimulus_name("Italians Making Music on Balconies Under Coronavirus Quarantine"): normalize_stimulus_name("Italy Balconies"),
        normalize_stimulus_name("Won't You Be My Neighbor? (Mr. Rogers Documentary)"): normalize_stimulus_name("Mr Rogers Doc"),
        normalize_stimulus_name("Rocky - Retrospective"): normalize_stimulus_name("Rocky"),
        normalize_stimulus_name("Sigur Rs - Hopppolla (Audio)"): normalize_stimulus_name("Sigur Ros - Hoppipolla (Audio)"),
        normalize_stimulus_name("Carl Sagan Pale Blue Dot (Audio)"): normalize_stimulus_name("Pale Blue Dot (Audio)"),
        normalize_stimulus_name("Remembering the Titans"): normalize_stimulus_name("Remember the Titans"),
        normalize_stimulus_name("Unsung Hero"): normalize_stimulus_name("Unsung Hero (Thai Insurance)"),
        normalize_stimulus_name("Jason Silva - Existential Bummer (Audio)") : normalize_stimulus_name("Jason Silva (Audio)"),
        normalize_stimulus_name("Dead Poet's Society - YAWP Scene	") : normalize_stimulus_name("Dead Poets (Audio)"),
        normalize_stimulus_name("Giving Is The Best Communication	") : normalize_stimulus_name("Thai Medicine	"),

    }

    # Step 6: Apply the manual mapping to the main dataframe
    main_df['Stimulus name'] = main_df['Stimulus name'].apply(lambda x: manual_mapping.get(x, x))

    # Optional: Print out the unique stimulus names in the main dataframe for debugging
    print("Unique stimulus names in main_df after manual mapping:")
    print(main_df['Stimulus name'].unique())

    # Step 7: Merge the aggregated chills data with the main dataframe
    df_merged = main_df.merge(df_chills_avg, on='Stimulus name', how='left')

    return df_merged

# Example usage:
df_final = process_and_merge_chills_data(chills_db,  df)



Unique stimulus names in main_df after manual mapping:
['agnus dei audio' 'miserere me audio' '3rd grade drop out audio'
 'unbroken audio' 'laughing heart audio' 'hallelujah choir audio'
 'jason silva audio' 'clair de lune audio' 'pale blue dot audio'
 'motorcycle diaries audio' 'pema chodron audio' 'duo des fleurs audio'
 'radiohead reckoner audio' 'sigur ros hoppipolla audio'
 'wild geese audio' 'great dictator audio'
 'think too much feel too little' 'aramaic choir' 'dead poets audio'
 'feynman audio' 'air france' 'be kind' 'interstellar'
 'mr rogers testimony' 'hunger games' 'cloud atlas' 'a thing about life'
 'remember the titans' 'amelie' 'thai medicine'
 'unsung hero thai insurance' 'muhammad ali' 'perfect planet'
 'italy balconies' 'mr rogers doc' 'hans zimmer time'
 'final battle scene jurassic world' 'rocky' 'great dictator']


In [None]:
df_final['lacks_punctuation'] = df_final['lacks_punctuation'].astype(int)
# We have removed the hunger games and jurassic world scenes from the final df because we cannot be sure which media is mapped.
df_final.to_csv('text_df.csv', index = False)

In [None]:
df_final

Unnamed: 0,Stimulus name,Description,URL,video_id,transcript,music,transcript_clean,word_count,sentence_count,tb_sentiment,...,topic_1,topic_2,topic_3,topic_4,topic_5,transcript_sentences,lacks_punctuation,Chills?,#Chills,Chills Intensity
0,agnus dei audio,The Flemish Radio Choir performs Samuel Barber...,https://youtu.be/bFnbGevBnvY,bFnbGevBnvY,,True,,1,1,0.0,...,0.5885761,0.0,0.0,0.0,0.0,[nan],1,0.538462,2.25641,35.192308
1,miserere me audio,Tenebrae Choir performs Gregorio Allegri’s Mis...,https://youtu.be/3nakMFiPB0w,3nakMFiPB0w,,True,,1,1,0.0,...,0.5885761,0.0,0.0,0.0,0.0,[nan],1,0.445946,2.054054,32.432432
2,3rd grade drop out audio,"Rick Rigsby is an ordained minister, motivatio...",https://youtu.be/Yu23MU4vsIM,Yu23MU4vsIM,the wisest person I ever met in my life a thir...,False,the wisest person I ever met in my life a thir...,1618,1,0.144687,...,0.0,0.372883,0.0,0.0,0.0,"[the wisest person I ever met in my life a, th...",1,0.532468,2.181818,35.363636
3,unbroken audio,"This motivational compilation, from a series b...",https://youtu.be/QRE2CUZxtQY,QRE2CUZxtQY,you can't connect the dots looking forward you...,False,you cant connect the dots looking forward you ...,742,1,0.116383,...,0.0,0.33759,0.037944,0.0,0.018957,[you cant connect the dots looking forward you...,1,0.613333,2.306667,39.906667
4,laughing heart audio,The Laughing Heart is a classic poem by Charle...,https://youtu.be/9COXybhp8p8,9COXybhp8p8,thank you life is your life [Music] don't let ...,False,thank you life is your life Music dont let it ...,102,1,0.24,...,0.0,0.159285,0.054683,0.0,0.0,[thank you life is your life Music dont let it...,1,0.416667,1.833333,28.305556
5,hallelujah choir audio,Choir! Choir! Choir! began as a weekly drop-in...,https://youtu.be/gCrUi_tRN8g,gCrUi_tRN8g,[Verse 1]\nNow I've heard there was a secret c...,False,Verse 1 Now Ive heard there was a secret chord...,365,1,0.036758,...,6.0964e-25,0.056484,0.0,1.7e-05,0.002581,[Verse 1 Now Ive heard there was a secret chor...,1,0.756757,2.608108,51.824324
6,jason silva audio,Storyteller Jason Silva considers the imperman...,https://youtu.be/Lz-P3WdIHvw,Lz-P3WdIHvw,foreign [Music] essay written by Sigmund Freud...,False,foreign Music essay written by Sigmund Freud c...,372,1,0.217677,...,0.0,0.320493,0.0,0.0,0.095922,[foreign Music essay written by Sigmund Freud ...,1,0.561644,2.09589,33.671233
7,clair de lune audio,"Claude Debussy’s “Clair de Lune,” (Suite berga...",https://youtu.be/JRinyHJ_9-E,JRinyHJ_9-E,,True,,1,1,0.0,...,0.5885761,0.0,0.0,0.0,0.0,[nan],1,0.441558,1.974026,29.298701
8,pale blue dot audio,"On Feb. 14, 1990, astronomer Carl Sagan gave a...",https://youtu.be/T2Qv_Vms-Yw,T2Qv_Vms-Yw,Look again at that dot. That's here. That's ho...,False,Look again at that dot. Thats here. Thats home...,743,43,0.084726,...,7.165726e-26,0.1171,0.001299,0.093933,0.004911,"[Look again at that dot., Thats here., Thats h...",0,0.434783,2.072464,26.927536
9,motorcycle diaries audio,De Ushuaia a La Quiaca (From Ushuaia to La Qui...,https://youtu.be/D95hQkiRNrQ,D95hQkiRNrQ,,True,,1,1,0.0,...,0.5885761,0.0,0.0,0.0,0.0,[nan],1,0.338028,1.676056,16.676056


# Model-Building

### Imports

In [None]:
df_final = pd.read_csv("text_df.csv")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
import xgboost


## Baseline Model -- Traditional Regression on Traditional Text Features

The goal of this is to establish a baseline level of performance of regression on the "Chills Intensity" Variable. Tuning and building the final model will be done at a later stage.

### T-T Split

In [None]:
# Prep our dataset for our SLR

df_slr = df_final.drop(['Stimulus name', 'Description ', 'URL', 'video_id', 'transcript',
       'music', 'transcript_clean', 'transcript_sentences'], axis = 1)

In [None]:
X = df_slr.drop(columns=[ 'Chills Intensity', 'Chills?', '#Chills'])  # drop or keep what's relevant
y = df_slr['Chills Intensity']  # or 'Chills Intensity'

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## SLR

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Mean Squared Error: 107.75847539456734
R-squared: -4.109205192714991


## Random Forest

In [None]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Mean Squared Error: 52.69112451914716
R-squared: -1.4982700063032914


### XGBoost

In [None]:
model = xgboost.XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared:", r2_score(y_test, y_pred))

Mean Squared Error: 75.69781202569912
R-squared: -2.5890973110256494


### 5-Fold CV using RFR

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(RandomForestRegressor(), X, y, scoring='neg_mean_squared_error', cv=kf)
print("Mean CV MSE:", -scores.mean())


Mean CV MSE: 46.89042509336822


Takeaway: predicting chills intensity given our dataset does not seem to be a great idea. We perhaps will get better results given a change of variable into classification!

## Binary Classification Task -- Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Create a copy of the DataFrame to avoid modifying the original
df_log = df_slr.copy()

# Convert 'Chills?' into a binary classification label (1 if >= 0.5, otherwise 0)
df_log['chills_binary'] = (df_log['Chills?'] >= 0.5).astype(int)

# Define features (X) and target variable (y)
X = df_log.drop(columns=['chills_binary', 'Chills Intensity', 'Chills?', '#Chills'])  # Drop non-relevant or target-related columns
y = df_log['chills_binary']  # Binary target variable (can alternatively use 'Chills Intensity')

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # Set random_state for reproducibility
)


### 5-Fold CV, Logistic Regression

In [None]:
kf = KFold(n_splits=5, shuffle=True)
scores = cross_val_score(LogisticRegression(max_iter = 1000), X, y, scoring='accuracy', cv=kf)
print(scores)
print("Mean CV Accuracy:", scores.mean())

[0.77777778 0.75       0.625      0.75       0.5       ]
Mean CV Accuracy: 0.6805555555555556
