In [5]:
import pandas as pd
import numpy as np
import nltk

def ensure_all_nltk_resources():
    # Attempt to find all NLTK resources
    try:
        # Check if 'all' resource is already downloaded
        nltk.data.find('all')
        print("All NLTK resources are already downloaded.")
    except LookupError:
        print("Some NLTK resources are missing. Downloading all resources...")
        nltk.download('all')

ensure_all_nltk_resources()

Some NLTK resources are missing. Downloading all resources...


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ch8765\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ch8765\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\ch8765\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\ch8765\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\ch8765\AppData\Roaming\nltk_data...
[nltk_data]    |   U

In [2]:
# import dataset
df = pd.read_csv('data\combined_data.csv')

In [3]:
df['text'].shape

(4932674,)

In [8]:
df.columns

Index(['text', 'timestamp', 'username', 'link', 'link_id', 'parent_id', 'id',
       'subreddit_id', 'moderation', 'yearmonth', 'title', 'index',
       'tokenized_text'],
      dtype='object')

In [6]:
# tokenization
from nltk.tokenize import word_tokenize
df['tokenized_text'] = df['text'].apply(word_tokenize)  

In [7]:
# remove punctuation and stopwords
from nltk.corpus import stopwords
from string import punctuation
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [word.lower() for word in x if word.lower() not in stopwords.words('english') and word.lower() not in punctuation])

In [13]:
# remove non-english words
import enchant
d = enchant.Dict("en_UK")
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [word for word in x if d.check(word)])

In [90]:
# lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
df['lemma_text'] = df['tokenized_text'].apply(lambda x: [wnl.lemmatize(word) for word in x])

In [94]:
# join text
df['cleaned_text'] = df['lemma_text'].apply(lambda x: ' '.join(x))

In [123]:
# to csv
df.to_csv('data\combined_data_tokenized.csv', index=False)

In [96]:
# td-idf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_matrix = vectorizer.fit_transform(df['cleaned_text'])
#df_tfidf = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names_out()) #this is too large to be stored

In [117]:
# calculate td-idf by month
monthly_tdidf = {}
vectorizer = TfidfVectorizer(stop_words='english')
tf_matrix_all = vectorizer.fit_transform(df['cleaned_text'])

for yearmonth, group in df.groupby('yearmonth'):
    tf_matrix = vectorizer.transform(group['cleaned_text'])
    monthly_tdidf[yearmonth] = tf_matrix

In [120]:
# view monthly important words
from scipy.sparse import csr_matrix

monthly_important_words = {}
threshold = 0.005
for yearmonth, tf_matrix in monthly_tdidf.items():
    feature_names = vectorizer.get_feature_names_out()
    
    # Calculate mean without converting to dense
    mean_tfidf = tf_matrix.mean(axis=0).A1  # A1 converts it to a 1D array
    mean_tfidf_series = pd.Series(mean_tfidf, index=feature_names)  # Create a Series

    # Filter important words based on the threshold
    important_words = mean_tfidf_series[mean_tfidf_series > threshold]  # Use the Series to filter
    sorted_important_words = important_words.sort_values(ascending=False)  # Sort the Series
    monthly_important_words[yearmonth] = sorted_important_words


In [122]:
# view monthly important words
for yearmonth, important_words in monthly_important_words.items():
    print(f"Important words in {yearmonth}:")
    print(important_words.head(20))
    print("\n")

Important words in 2020-01:
deleted    0.057714
removed    0.023865
like       0.014822
people     0.010851
time       0.010198
good       0.009573
think      0.009266
gt         0.009046
know       0.008837
thanks     0.007515
really     0.007472
got        0.007280
year       0.007253
need       0.007252
day        0.007034
want       0.006888
work       0.006622
mask       0.006384
thing      0.006264
say        0.006000
dtype: float64


Important words in 2020-02:
deleted    0.061756
removed    0.023870
like       0.014732
people     0.013114
time       0.009972
think      0.009491
good       0.009123
mask       0.008855
know       0.008836
gt         0.008272
need       0.007669
day        0.007461
really     0.007424
case       0.007292
got        0.007243
thanks     0.006940
virus      0.006822
work       0.006744
thing      0.006311
want       0.006130
dtype: float64


Important words in 2020-03:
deleted    0.061997
removed    0.017020
like       0.014607
people     0.013611
ti

In [108]:
# removing words
threshold = 0.005
filtered_words = mean_tfidf[mean_tfidf > threshold]
print(len(filtered_words))

33


In [14]:
# create a new datasets, with frequency of word occurence by month
from collections import Counter
df['word_freq'] = df['tokenized_text'].apply(lambda x: Counter(x))
df['word_freq'].head()

0           {'entry': 1, 'requirements': 1, 'less': 1}
1       {'jam': 1, 'like': 1, 'barely': 1, 'worth': 1}
2    {'outside': 1, 'guess': 1, 'reaching': 1, 'aro...
3                   {'female': 1, 'male': 1, 'cry': 1}
4    {'bring': 2, 'umbrella': 1, 'gets': 1, 'really...
Name: word_freq, dtype: object

In [16]:
monthly_word_freq = {}
for yearmonth, group in df.groupby('yearmonth'):
    all_words = [word for word_freq in group['word_freq'] for word in word_freq]
    monthly_word_freq[yearmonth] = Counter(all_words)

# create a dataframe from the dictionary
df_word_freq_month = pd.DataFrame(monthly_word_freq).fillna(0).transpose()

In [17]:
df_word_freq_month.head()

Unnamed: 0,entry,requirements,less,jam,like,barely,worth,outside,guess,reaching,...,uniform-groups,"3,285,000,000",peasant-free,economically-priced,0.000000000000000000000001,8849,0472,slabber,phalli,street-shitting
2020-01,145.0,92.0,921.0,51.0,7865.0,140.0,532.0,485.0,1054.0,42.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02,157.0,102.0,1174.0,36.0,8610.0,128.0,520.0,691.0,1053.0,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-03,244.0,129.0,1397.0,64.0,9909.0,171.0,606.0,729.0,1370.0,48.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-04,186.0,165.0,1992.0,30.0,12839.0,176.0,690.0,1252.0,1777.0,83.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-05,214.0,157.0,1711.0,31.0,11825.0,189.0,739.0,937.0,1568.0,76.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# save to csv
df_word_freq_month.to_csv('data\word_freq_month.csv',replace=True)