#  Blues Clues TIPS Analysis

### This is a notebook for analysis of the TIPS dataset provided by the Institute for Advanced Analytics

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.probability import FreqDist

In [None]:
# Load the TIPS export; the path is resolved relative to the notebook's working directory.
tips_csv_path = 'MSA2024_TIPS_Data.csv'
df = pd.read_csv(tips_csv_path)
# Preview the first rows as the cell's output.
df.head()

# Text pre-processing

In [None]:
# Text pre-processing plan: lower-case the text, drop stop words and punctuation,
# then plot unigram, bigram and trigram frequencies for each year.
# IMPORTANT: a "token" is one piece of a split-up string. We say "token" rather than
# "word" because not every item in a sentence is an English word (numbers, symbols, etc.).
#
# Target data structure:
#   years = {2016 ... 2023}
#   each year maps to a list of lists containing the text tokens
#
# Swap every NaN for a single-space string (a placeholder string, not an empty one).
df = df.replace(np.nan, " ")

In [None]:
# Initialize the years dictionary with empty lists; each list collects one
# token list per tip cell.
years = {2016: [], 2017: [], 2018: [], 2019: [], 2020: [], 2021: [], 2022: [], 2023: []}

# Build the filters once, outside the loops: the stop-word list does not change
# between cells, and set membership avoids a linear scan for every token.
# NOTE: consider not removing stopwords, as the cleaned text doesn't read very well.
stop_words = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)

# for every row (plain tuples avoid a df.iloc lookup per cell)
for row in df.itertuples(index=False, name=None):
    # the year is read from the fifth column (position 4)
    year = row[4]
    # every column except the last one is treated as tip text
    # NOTE(review): this assumes the year column is also the last column,
    # i.e. the frame has exactly five columns -- confirm against the CSV.
    for text in row[:-1]:
        # get the tip as lower-case tokens
        tokens = nltk.word_tokenize(text.lower())
        # Drop stop words and any token made up solely of punctuation characters.
        # A substring test against string.punctuation would let multi-character
        # tokens such as "``", "''" or "..." slip through.
        clean_tips = [
            word
            for word in tokens
            if word not in stop_words
            and not all(char in punctuation_chars for char in word)
        ]
        # Wrap the tip in start/end markers so we know where one tip ends and
        # the next begins once the lists are flattened.
        # NOTE(review): cells that were NaN still produce a bare
        # ["start_token", "end_token"] entry -- decide whether to skip them.
        years[year].append(["start_token", *clean_tips, "end_token"])

# Getting the unigrams
#### (uncomment the print at the end to see the map)

In [None]:
# Unigram probabilities, stored in one dictionary per year
unigram_map = {year: {} for year in range(2016, 2024)}

# go through every year together with its list of tips
for year, tip_lists in years.items():

    # flatten this year's tips into a single list of tokens
    unigrams = []
    for tips in tip_lists:
        unigrams.extend(tips)

    # count how often each unigram occurs
    fdist = FreqDist(unigrams)

    # total number of unigrams counted for this year
    total_unigrams = fdist.N()

    # convert each count to a probability and store it in the unigram map
    unigram_map[year].update(
        (unigram, frequency / total_unigrams) for unigram, frequency in fdist.items()
    )

# unigram_map now holds, for every year, the probability of each unigram
# print(unigram_map)

# Getting the bigrams
#### (uncomment the print at the end to see the map)

In [None]:
# Bigram probabilities, stored in one dictionary per year
bigram_map = {year: {} for year in range(2016, 2024)}

# go through every year together with its list of tips
for year, tip_lists in years.items():

    # flatten this year's tips into a single list of tokens
    words = []
    for tips in tip_lists:
        words.extend(tips)

    # pair up consecutive tokens; because the list is flattened, a pair can
    # span the end_token of one tip and the start_token of the next
    bigrams = list(nltk.bigrams(words))

    # count how often each bigram occurs
    fdist = FreqDist(bigrams)

    # total number of bigrams counted for this year
    total_bigrams = fdist.N()

    # convert each count to a probability and store it in the bigram map
    bigram_map[year].update(
        (bigram, frequency / total_bigrams) for bigram, frequency in fdist.items()
    )

# bigram_map now holds, for every year, the probability of each bigram
# print(bigram_map)

# Getting the trigrams
#### (uncomment the print at the end to see the map)

In [None]:
# Trigram probabilities, stored in one dictionary per year
trigram_map = {year: {} for year in range(2016, 2024)}

# go through every year together with its list of tips
for year, tip_lists in years.items():

    # flatten this year's tips into a single list of tokens
    words = []
    for tips in tip_lists:
        words.extend(tips)

    # group consecutive tokens in threes; because the list is flattened, a
    # triple can span the boundary between two tips
    trigrams = list(nltk.trigrams(words))

    # count how often each trigram occurs
    fdist = FreqDist(trigrams)

    # total number of trigrams counted for this year
    total_trigrams = fdist.N()

    # convert each count to a probability and store it in the trigram map
    trigram_map[year].update(
        (trigram, frequency / total_trigrams) for trigram, frequency in fdist.items()
    )

# trigram_map now holds, for every year, the probability of each trigram
# print(trigram_map)
