# 1. Install and load all necessary packages

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
import spacy

# Ensure the NLTK resources used below are available locally.
# word_tokenize() relies on the Punkt models: older NLTK versions load
# "punkt", recent releases look up "punkt_tab" instead, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# Load spaCy's small English pipeline (used for part-of-speech tagging)
nlp = spacy.load("en_core_web_sm")

# 2. Getting Text into R/Python

In [149]:
# Fetch the data set (a CSV file with ratings and content of TV series)
# directly from the GitHub repository; fields are separated by ";"
url = "https://raw.githubusercontent.com/valeriehase/Salamanca-CSS-SummerSchool/main/Processing%20text%20and%20text%20as%20data/data_tvseries.csv"
data = pd.read_csv(url, sep=";")

In [None]:
# Check the data by looking at its first five rows
data.head(n=5)

In [None]:
# Look at the first observation's value in the second column
# (expected to be the variable "Year") - any issues?
data.iat[0, 1]

# 3. Cleaning/Normalizing Text



## Cleaning Text via Regular Expressions



In [None]:
# Let's remove the leading number, the point and the blank space before the
# TV series name in our variable "Title" using replace().
# The pattern is a raw string so that "\." stays a regex escape (a plain
# "\." literal triggers an invalid-escape warning on modern Python), and
# "\s*" removes the blank after the point so titles do not start with a space.
data["Title"] = data["Title"].replace(r"^[0-9]+\.\s*", "", regex=True)

# Inspect the result
data.head()

In [None]:
# Ok, let's have some fun with this.
# Using the str.contains() function, we identify all TV series
# that contain the word "drama" in the variable "Description".
# Inside a character class "|" is a literal pipe rather than "or", so the
# class is written [Dd] to accept an upper- or lower-case "d" only.
data[data["Description"].str.contains(r"[Dd]rama")].head()

In [None]:
# Let's get all observations that contain the word
# "drama" or the word "crime" in the variable "Description".
# The alternation "|" belongs between the two words; inside the character
# classes only the two letter cases are listed ([Dd], [Cc]).
data[data["Description"].str.contains(r"[Dd]rama|[Cc]rime")].head()

In [None]:
# Your turn!
# Which series are set in Spain? Select the rows whose "Description"
# contains the phrase "in Spain".
data.loc[data["Description"].str.contains("in Spain")]

In [None]:
# Your turn!
# Can you identify all series that deal with superheroes and replace the
# term "superhero"/"superheroes" in the variable "Description"
# with "fancy Python programmers"?
# "(?:e?s)?" matches an optional plural ending only, and the "\b" word
# boundaries (instead of a required trailing blank) find the term before
# punctuation as well and leave the following space in place.
data["Description"].str.replace(r"\b[Ss]uperhero(?:e?s)?\b", "fancy Python programmers", regex=True).head()

## Normalizing Text



In [156]:
# Set up the Porter stemmer and the English stop-word collection
# (stored as a set for quick membership tests)
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

# Bundle all necessary preprocessing steps in one function
def clean_description(description):
    """Turn one description into a list of normalized tokens.

    Steps: tokenize with ``word_tokenize``, keep purely alphabetic tokens,
    lower-case them, drop entries of the global ``stop_words`` and stem the
    remaining tokens with the global ``stemmer``.
    """
    cleaned = []
    for token in word_tokenize(description):
        # Skip anything that is not purely alphabetic (special signs etc.)
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Skip stop words
        if lowered in stop_words:
            continue
        cleaned.append(stemmer.stem(lowered))
    return cleaned

# Preprocess every description; the result is one token list per text
tokens = list(map(clean_description, data["Description"]))

In [None]:
# Look at the original (unprocessed) first text
data["Description"].iat[0]

In [None]:
# Look at the preprocessed first text (its list of tokens)
tokens[0]

In [159]:
# Your turn!
# Can you create a list of 3-5 stop words that you think are unique to this
# corpus and remove these as part of the existing preprocessing pipeline?
unique_stopwords = [
    "one",
    "two",
    "three",
    "four",
    "five",
]

# Preprocessing function extended by the corpus-specific stop words
def clean_description(description):
    """Turn one description into a list of normalized tokens.

    Tokenizes with ``word_tokenize``, keeps purely alphabetic tokens in
    lower case, removes entries of the global ``stop_words`` as well as of
    the global ``unique_stopwords`` and stems what is left with the global
    ``stemmer``.
    """
    # Alphabetic tokens only, converted to lower case
    alphabetic = (
        token.lower() for token in word_tokenize(description) if token.isalpha()
    )
    # Drop general and corpus-specific stop words
    kept = (
        token
        for token in alphabetic
        if token not in stop_words and token not in unique_stopwords
    )
    # Apply stemming
    return [stemmer.stem(token) for token in kept]

# Re-run the preprocessing so that tokens reflects the extended pipeline
tokens = list(map(clean_description, data["Description"]))

# 4. Text-as-Data Representations



## Bag-of-words approach: Document-feature matrix

In [None]:
# Preprocessing variant for the document-feature matrix (dfm)
def clean_description_dfm(description):
    """Return the preprocessed description as one space-separated string.

    Tokenizes with ``word_tokenize``, keeps purely alphabetic tokens in
    lower case, removes entries of the global ``stop_words``, stems the rest
    with the global ``stemmer`` and joins the stems with single blanks.
    """
    stems = []
    for token in word_tokenize(description):
        if token.isalpha():
            lowered = token.lower()
            if lowered not in stop_words:
                stems.append(stemmer.stem(lowered))
    # Re-join the tokens into a single string
    return " ".join(stems)

# One preprocessed, space-separated string per description
tokens_dfm = [clean_description_dfm(description) for description in data["Description"]]

# Create a document-feature matrix (one row per document, one column per feature)
vectorizer = CountVectorizer()
dfm = vectorizer.fit_transform(tokens_dfm)

# Print the first rows in dense format. toarray() yields a plain ndarray,
# whereas todense() returns a numpy.matrix, a type NumPy discourages.
pd.DataFrame(dfm.toarray(), columns=vectorizer.get_feature_names_out()).head()

In [None]:
# Column labels of the dfm
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the dfm, used for the calculations below
dfm_dense = dfm.toarray()

# Check most frequent features
def top_features(matrix, feature_names, top_n):
    """Return the ``top_n`` most frequent features of a document-feature matrix.

    Parameters:
        matrix: 2-D count matrix with one column per feature. Anything
            ``np.sum`` can reduce along axis 0 is accepted.
        feature_names: sequence with one name per column of ``matrix``.
        top_n: number of rows to return.

    Returns:
        A DataFrame with the columns ``feature`` and ``count``, sorted by
        ``count`` in descending order and limited to ``top_n`` rows.
    """
    # Sum the occurrences of each feature. Some matrix types (numpy.matrix,
    # scipy sparse matrices) return the column sums as a 1 x n matrix, so
    # flatten the result to one dimension before building the data frame.
    feature_sums = np.asarray(np.sum(matrix, axis=0)).ravel()
    # Create a data frame to hold feature names and their corresponding sums
    feature_sums_df = pd.DataFrame({'feature': feature_names, 'count': feature_sums})
    # Sort the data frame by count in descending order and get the top N features
    top_features_df = feature_sums_df.sort_values(by="count", ascending=False).head(top_n)
    return top_features_df

# The ten most frequent features of the dfm
topfeatures = top_features(matrix=dfm_dense, feature_names=feature_names, top_n=10)

topfeatures

In [None]:
# Visualize the feature frequencies with a word cloud

# Total count per feature (column sums of the dense dfm)
feature_sums = dfm_dense.sum(axis=0)

# Map every feature to its total count
feature_counts = dict(zip(feature_names, feature_sums))

# Configure the word cloud first, then fill it from the frequency mapping
wordcloud = WordCloud(background_color="white", max_words=100)
wordcloud.generate_from_frequencies(feature_counts)

# Render the word cloud with matplotlib, hiding the axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Beyond bag-of-words: Ngrams

In [None]:
# Flatten the list of lists into a single list of tokens
# (not needed by the finder below; it stays defined in case other cells use it)
all_tokens = [token for sublist in tokens for token in sublist]

# Find bigram collocations. from_documents() treats each description as its
# own document, so the last token of one description is never paired with the
# first token of the next one, which would happen on the flattened list.
finder = BigramCollocationFinder.from_documents(tokens)

# Filter out bigrams that occur less than 10 times
finder.apply_freq_filter(10)

# Score the bigrams using the likelihood ratio
scored = finder.score_ngrams(BigramAssocMeasures.likelihood_ratio)

# Convert to a DataFrame for easier manipulation
scored_df = pd.DataFrame(scored, columns=["bigram", "likelihood_ratio"])

# Sort by the likelihood ratio in descending order and take the top 10
top_10_collocations = scored_df.sort_values(by="likelihood_ratio", ascending=False).head(10)

# Print the top 10 collocations
top_10_collocations

## Beyond bag-of-words: Part-of-speech tagging

In [None]:
# Keep the run short: tag only the first document
sample = data.head(1)

# Part-of-speech tagging; collect only the variables of interest
pos_tags = []
for text in sample["Description"]:
    doc = nlp(text)
    for sent in doc.sents:
        # sent.start (index of the sentence's first token) serves as sentence id
        pos_tags.extend(
            {
                'sentence_id': sent.start,
                'token_id': token.i,
                'token': token.text,
                'upos': token.pos_,
            }
            for token in sent
        )

# Turn the collected records into a DataFrame
pos_df = pd.DataFrame(pos_tags)

# Show the first 10 rows
pos_df.head(10)