# Loading the text file, cleaning, and tokenizing

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.tokenize import TweetTokenizer
# Read the scraped corpus. The encoding is stated explicitly so the file is
# decoded the same way on every platform instead of depending on the locale
# default (the CSV files written later in this notebook also use UTF-8).
# NOTE(review): assumes raw_marrietta.txt is stored as UTF-8 -- confirm.
with open("raw_marrietta.txt", "r", encoding="utf-8") as file:
    raw_corpus_text = file.read()

# Cleaning: replace the garbled quotation-mark sequences found in the corpus
# with the plain quotes recognized by the tokenizer. Each sequence is listed
# once; none of them is a substring of another, so the order does not matter.
QUOTE_REPLACEMENTS = {
    "‚Äô": "'",
    "‚Äò": "'",
    "‚Äú": '"',
    "‚Äù": '"',
}

full_corpus_text = raw_corpus_text
for garbled, plain in QUOTE_REPLACEMENTS.items():
    full_corpus_text = full_corpus_text.replace(garbled, plain)


# Punctuation characters collected in a set for membership checks.
punct = set(string.punctuation)

# TweetTokenizer is used because it copes well with short, informal texts
# (like reddit posts). Configuration: tokens are lower-cased, @handles are
# stripped, and over-long runs of a repeated character are shortened.
tknzr = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False,
)


def tokenize_twitter(text):
    """Split ``text`` into a list of tokens using the shared ``tknzr`` instance."""
    return tknzr.tokenize(text)

def _is_punctuation_only(token):
    """Return True when every character of ``token`` is in the ``punct`` set."""
    return all(ch in punct for ch in token)


# Tokenize the cleaned corpus, then drop punctuation tokens and the garbled
# ellipsis sequence. The check is done character by character: testing
# `t in string.punctuation` is only a substring test against one long string,
# so multi-character punctuation tokens such as "..." or "!!" would slip
# through and end up inside the scored n-grams.
raw_tokens = tokenize_twitter(full_corpus_text)
tokens = [
    t for t in raw_tokens
    if not _is_punctuation_only(t) and t != "‚Ä¶"
]
print(tokens)


['https://www.reddit.com/r/Marietta/comments/173q5sh/anyone_here_live_off_franklin_road_in_the_late/', 'anyone', 'here', 'live', 'off', 'franklin', 'road', 'in', 'the', 'late', '90s', 'early', '2000s', 'preston', 'chase', 'apartments', 'specifically', 'i', 'lived', 'in', 'franklin', 'park', 'in', '93', 'my', 'first', 'ever', 'apartment', '420', 'mo', 'for', 'a', '1', 'bedroom', 'clean', 'but', 'somewhat', 'sketchy', 'i', 'had', '5', 'pairs', 'of', 'jeans', 'stolen', 'from', 'the', 'laundry', 'which', 'was', 'devastating', 'at', '23', 'years', 'old', 'the', 'complex', 'was', 'leveled', 'and', "it's", 'now', 'the', 'atlanta', 'united', 'training', 'complex', 'then', 'i', 'lived', 'at', 'st', 'augustine', 'place', 'for', 'several', 'years', 'i', 'was', 'living', 'the', 'dream', 'yeah', 'all', 'those', 'apartments', 'seemed', 'to', 'be', 'built', 'for', 'life', 'university', 'i', 'could', 'be', 'wrong', 'when', 'i', 'lived', 'there', 'as', 'a', 'kid', 'it', 'was', 'predominantly', 'mexican

In [18]:
from nltk.collocations import BigramCollocationFinder

# Build the bigram collocation finder from the cleaned token sequence; it is
# frequency-filtered and scored in the cells that follow.
finder = BigramCollocationFinder.from_words(tokens)


In [19]:
# This makes sure the bigrams that are scored occur in the text at least twice.
# NOTE: a minimum of 3 occurrences is often recommended; 2 is used here, so
# raise the value below if a stricter filter is wanted.

finder.apply_freq_filter(2)  # minimum frequency = 2


# computing the PMI scores

In [20]:
from nltk.collocations import BigramAssocMeasures
# PMI score: association-measure object whose `pmi` method is used as the
# scoring function for the bigrams kept by the frequency filter.
bigram_measures = BigramAssocMeasures()

# pmi_scored is a list of ((word1, word2), score) tuples; the printed output
# is ordered from the highest to the lowest score.
pmi_scored = finder.score_ngrams(bigram_measures.pmi)
print(pmi_scored)


[(('preston', 'chase'), 11.3409627642517), (('spray', 'berry'), 11.3409627642517), (('st', 'augustine'), 11.3409627642517), (('terrell', 'mill'), 11.3409627642517), (('united', 'training'), 11.3409627642517), (('door', 'open'), 10.756000263530543), (('sandy', 'plains'), 10.756000263530543), (('test', 'scores'), 10.756000263530543), (('windy', 'hill'), 10.756000263530543), (('break', 'ins'), 10.3409627642517), (('improved', 'greatly'), 10.019034669364338), (('resale', 'value'), 10.019034669364338), (('property', 'values'), 9.756000263530543), (('shopping', 'center'), 9.756000263530543), (('90', 's'), 9.533607842194096), (('feel', 'unsafe'), 9.533607842194096), (('take', 'care'), 9.533607842194096), (('after', 'moving'), 9.43407216864318), (('town', 'center'), 9.3409627642517), (('low', 'crime'), 9.171037762809387), (('next', 'year'), 9.171037762809387), (('run', 'down'), 9.171037762809387), (('slows', 'down'), 9.171037762809387), (('drive', 'through'), 9.019034669364338), (('further', '

In [21]:
# pmi_scored is a list of tuples: [ ((w1, w2), score), ... ]

# extract scores
scores = [score for (_, score) in pmi_scored]

# `default` keeps this cell runnable when no bigram survived the frequency
# filter (min()/max() raise ValueError on an empty list otherwise).
min_s = min(scores, default=0.0)
max_s = max(scores, default=0.0)


def scale_score(s):
    """Min-max rescale a raw PMI score ``s`` onto the range 0..20.

    The lowest raw score maps to 0 and the highest to 20.

    Returns 0.0 when all scores are identical (or there are none): the score
    range is then zero and the rescaling formula would divide by zero.
    """
    span = max_s - min_s
    if span == 0:
        return 0.0
    return 20 * (s - min_s) / span


# apply scaling
scaled_pmi = [(bigram, scale_score(score)) for (bigram, score) in pmi_scored]


# saving scores to csv

In [22]:
import csv

# Each element of scaled_pmi has the form ((word1, word2), rescaled_score),
# so the "PMI_score" column holds the rescaled value, not the raw PMI.
with open("bigrams_pmi.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row first, then one row per bigram.
    writer.writerow(["word1", "word2", "PMI_score"])
    writer.writerows(
        [first, second, value] for (first, second), value in scaled_pmi
    )

print("Saved all bigrams and their PMI scores to bigrams_pmi.csv")

Saved all bigrams and their PMI scores to bigrams_pmi.csv


# Repeating the PMI scoring, scaling, and CSV export for trigrams

In [23]:
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

# Create the trigram finder from the same cleaned token sequence used for
# the bigrams.
trigram_finder = TrigramCollocationFinder.from_words(tokens)

# Filter out rare trigrams: only trigrams seen at least twice are scored
# (min freq = 2).
trigram_finder.apply_freq_filter(2)

# Scoring method: association-measure object whose `pmi` method is passed
# to the finder below.
trigram_measures = TrigramAssocMeasures()

# Get PMI scores as a list of ((w1, w2, w3), score) tuples. The values
# printed here are the raw scores; rescaling happens further down.
trigram_pmi_scored = trigram_finder.score_ngrams(trigram_measures.pmi)
print(trigram_pmi_scored)

# Extract scores
tri_scores = [score for (_, score) in trigram_pmi_scored]

# `default` keeps this cell runnable when no trigram survived the frequency
# filter (min()/max() raise ValueError on an empty list otherwise).
min_tri = min(tri_scores, default=0.0)
max_tri = max(tri_scores, default=0.0)


def scale_trigram_score(s):
    """Min-max rescale a raw trigram PMI score ``s`` onto the range 0..20.

    The lowest raw score maps to 0 and the highest to 20.

    Returns 0.0 when all scores are identical (or there are none): the score
    range is then zero and the rescaling formula would divide by zero.
    """
    tri_span = max_tri - min_tri
    if tri_span == 0:
        return 0.0
    return 20 * (s - min_tri) / tri_span


# Scale scores
scaled_trigram_pmi = [
    (trigram, scale_trigram_score(score))
    for (trigram, score) in trigram_pmi_scored
]

# Save to CSV: each element of scaled_trigram_pmi has the form
# ((word1, word2, word3), rescaled_score), so the "PMI_score" column holds
# the rescaled value, not the raw PMI.
with open("trigrams_pmi.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # Header row first, then one row per trigram.
    writer.writerow(["word1", "word2", "word3", "PMI_score"])
    writer.writerows(
        [first, second, third, value]
        for (first, second, third), value in scaled_trigram_pmi
    )

print("Saved all trigrams and their PMI scores to trigrams_pmi.csv")


[(('terrell', 'mill', 'road'), 20.222493909866103), (('st', 'augustine', 'place'), 19.874570606445793), (('car', 'break', 'ins'), 19.096963027782245), (('has', 'improved', 'greatly'), 18.190072432173725), (('hoping', 'it', 'slows'), 17.39652330964115), (('spray', 'berry', 'is'), 17.378144780326295), (('has', 'gotten', 'worse'), 16.190072432173725), (('further', 'into', 'east'), 16.038069338728675), (('at', 'all', 'hours'), 15.894022969111967), (('take', 'care', 'of'), 15.607784065750893), (('kind', 'of', 'ghetto'), 15.50824839219998), (("you'll", 'be', 'fine'), 15.38271751011612), (('it', 'slows', 'down'), 15.226598308198838), (('to', 'take', 'care'), 14.9557073691712), (('sprayberry', 'high', 'school'), 14.593137289786492), (('sold', 'my', 'house'), 14.589168387583547), (('as', 'long', 'as'), 14.423359494613468), (('grew', 'up', 'there'), 14.352129190282698), (("don't", 'have', 'kids'), 14.119683104282329), (('cobb', 'snob', 'thing'), 14.052568908423789), (('decent', 'place', 'to'), 1