#  Blues Clues TIPS Analysis

### This is a notebook for analysis of the TIPS dataset provided by the Institute for Advanced Analytics

In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
# Remove start and end_tokens when looking at distribution

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\edbak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\edbak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
df = pd.read_csv('MSA2024_TIPS_Data.csv')
df.head()

Unnamed: 0,TIP #1,TIP #2,TIP #3,Anything else you want to mention? [this can be more TIPS if you are overflowing with advice],Year
0,Keep in mind before the class: where have we b...,Remember to clean the kitchen,,,2023
1,prioritize life outside of the IAA so you don'...,,start the data visualization project way earli...,,2023
2,"When they say ""trust the process,"" you actuall...",you are expected to maintain an A/B grade poin...,"Fall 2 is the most stressful, fast-paced semes...",Don't worry about the TIPS project so much. It...,2023
3,when you are interested in an analytic group ...,If you are coming into the program after havin...,"Depending on your cohort, you may or may not m...","Put your mental health and relationships, abov...",2023
4,Take good care of yourself. Getting adequate s...,"Don't be shy to ask the faculty, classmates, T...",Hang out with your classmates during lunch or ...,It's useful to have prior knowledge of Python ...,2023


# Text pre-processing

In [50]:
# Text pre-processing, remove stop words etc, lower case
# Plot Unigram, bigram, trigram frequencies for each year
# IMPORTANT: A 'token' is just a word. We call them tokens because not every item in a sentence is an english word... i.e.
# numbers, symbols, etc. When I split things into tokens, I am just breaking up the bigger string

# data structure looks like this
# years = {2016->2023}
# each year is list of lists containing the text tokens
# Replace the NaNs with empty string
df=df.replace(to_replace = np.nan, value = " ") 

In [51]:
# intializing our years dictionary with empty lists that will contain sublists of TIPS as tokens
years = {2016: [], 2017: [], 2018: [], 2019: [], 2020: [], 2021: [], 2022: [], 2023: []}

# for every row
for i in range(len(df)):
    # and every column excluding the years
    for j in range(len(df.columns) - 1):
        # get the tips as lower case tokens
        tips = nltk.word_tokenize(df.iloc[i,j].lower())
        # remove stopwords and punctuation before adding
        # NOTE: consider not removing stopwords, as it doesn't read very well, see output below
        stop_words = stopwords.words('english')
        clean_tips = [word for word in tips if word not in stop_words and word not in string.punctuation and word != "n't" and word != "'re" and word != "'s"]
        # add the tips to the dictionary, only if non-empty
        if(clean_tips != []):
            years[df.iloc[i,4]].append(clean_tips)

# print(years)

# Getting the unigrams
#### (uncomment the print at the end to see the map)

In [52]:
# Obtain Unigram counts
unigram_map = {2016: {}, 2017: {}, 2018: {}, 2019: {}, 2020: {}, 2021: {}, 2022: {}, 2023: {}}

# go through every year
for year in years:
    
    # get unigrams
    unigrams = [word for tips in years[year] for word in tips]
    
    # get distribution of unigrams
    fdist = FreqDist(unigrams)

    # total number of unigrams
    total_unigrams = fdist.N()

    # convert counts to probablities
    for unigram, frequency in fdist.items():
        prob = frequency / total_unigrams
        # add probs to bigram map
        unigram_map[year][unigram] = prob
        
# now we have a unigram probability distribution for every word, for every year, saved in this unigram map!
# print(unigram_map)

### Creating distribution graph

In [55]:
for year, word_freqs in unigram_map.items():
    sorted_freqs = sorted(word_freqs.items(), key=lambda x: x[1], reverse=True)
    top_10_words = [word for word, freq in sorted_freqs[:10]]
    print(f"Year {year}: Top 10 words by frequency are {top_10_words}")

Year 2016: Top 10 words by frequency are ['get', 'time', 'learn', 'take', 'make', 'program', 'job', 'work', 'help', 'much']
Year 2017: Top 10 words by frequency are ['time', 'get', 'work', 'program', 'make', 'team', 'take', 'learn', 'try', 'people']
Year 2018: Top 10 words by frequency are ['time', 'get', 'help', 'learn', 'make', 'team', 'people', 'take', 'try', 'great']
Year 2019: Top 10 words by frequency are ['get', 'program', 'work', 'take', 'learn', 'interviews', 'time', 'good', 'practicum', 'know']
Year 2020: Top 10 words by frequency are ['time', 'get', 'program', 'people', 'make', 'work', 'take', 'know', 'classmates', 'things']
Year 2021: Top 10 words by frequency are ['program', 'get', 'time', 'make', 'take', 'work', 'help', '’', 'learn', 'people']
Year 2022: Top 10 words by frequency are ['time', 'get', 'program', 'take', 'make', 'work', 'go', 'job', 'help', 'know']
Year 2023: Top 10 words by frequency are ['time', 'program', 'make', 'get', 'work', 'people', 'friends', 'fall'

In [56]:
# insert start and end tokens to know where one tip ends and the next begins
for year in years:
    for tips in years[year]:
        tips.insert(0,"start_token")
        tips.append("end_token")

# Getting the bigrams
#### (uncomment the print at the end to see the map)

In [64]:
# Obtain Bigram counts

# creating the bigram map
bigram_map = {2016: {}, 2017: {}, 2018: {}, 2019: {}, 2020: {}, 2021: {}, 2022: {}, 2023: {}}

# go through every year
for year in years:
    
    # and flatten the arrays
    words = [word for tips in years[year] for word in tips]
    
    # get bigrams
    bigrams = list(nltk.bigrams(words))
    
    # get distribution of bigrams
    fdist = FreqDist(bigrams)

    # total number of bigrams
    total_bigrams = fdist.N()

    # convert counts to probablities
    for bigram, frequency in fdist.items():
        prob = frequency / total_bigrams
        # add probs to bigram map
        bigram_map[year][bigram] = prob

# now we have a bigram probability distribution for every word, for every year, saved in this bigram map
    
# print(bigram_map)

In [66]:
for year, word_freqs in bigram_map.items():
    sorted_freqs = sorted(word_freqs.items(), key=lambda x: x[1], reverse=True)
    top_10_words = [word for word, freq in sorted_freqs[:15]]
    print(f"Year {year}: Top 10 words by frequency are {top_10_words}")

Year 2016: Top 10 words by frequency are [('end_token', 'start_token'), ('start_token', 'take'), ('start_token', 'learn'), ('program', 'end_token'), ('make', 'sure'), ('take', 'time'), ('fall', 'semester'), ('job', 'search'), ('start_token', 'make'), ('start_token', 'keep'), ('start_token', 'try'), ('start_token', 'trust'), ('r', 'python'), ('get', 'know'), ('start_token', 'apply')]
Year 2017: Top 10 words by frequency are [('end_token', 'start_token'), ('start_token', 'take'), ('make', 'sure'), ('start_token', 'try'), ('get', 'know'), ('take', 'time'), ('program', 'end_token'), ('job', 'search'), ('work', 'end_token'), ('try', 'get'), ('start_token', 'make'), ('take', 'advantage'), ('interview', 'season'), ('process', 'end_token'), ('start_token', 'start')]
Year 2018: Top 10 words by frequency are [('end_token', 'start_token'), ('trust', 'process'), ('practicum', 'team'), ('start_token', 'trust'), ('start_token', 'get'), ('start_token', 'take'), ('get', 'know'), ('open', 'source'), ('

# Getting the trigrams
#### (uncomment the print at the end to see the map)

In [None]:
# Obtain Trigram counts

# creating the trigram map
trigram_map = {2016: {}, 2017: {}, 2018: {}, 2019: {}, 2020: {}, 2021: {}, 2022: {}, 2023: {}}

# go through every year
for year in years:
    
    # and flatten the arrays
    words = [word for tips in years[year] for word in tips]
    
    # get trigrams
    trigrams = list(nltk.trigrams(words))
    
    # get distribution of trigrams
    fdist = FreqDist(trigrams)

    # total number of trigrams
    total_trigrams = fdist.N()

    # convert counts to probablities
    for trigram, frequency in fdist.items():
        prob = frequency / total_trigrams
        # add probs to trigram map
        trigram_map[year][trigram] = prob

# now we have a trigram probability distribution for every word, for every year, saved in this trigram map!
# print(trigram_map)
