In [165]:
import nltk
import pandas as pd
df = pd.read_excel('posts_first_targil.xlsx', sheet_name=None)

In [166]:
for sheet_name, sheet_df in df.items():
    print(f"Sheet name: {sheet_name}")
    print(sheet_df.columns)

Sheet name: A-J
Index(['sub_title', 'date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: BBC
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: J-P
Index(['date', 'Newspaper', 'Body', 'title'], dtype='object')
Sheet name: NY-T
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')


In [167]:
# we're renaming J-P column to match others
df['J-P'].rename(columns={'Body': 'Body Text'}, inplace=True)

In [168]:
for sheet_name, sheet_df in df.items():
    print(f"Sheet name: {sheet_name}")
    print(sheet_df.columns)

Sheet name: A-J
Index(['sub_title', 'date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: BBC
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: J-P
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: NY-T
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')


In [169]:
import re

In [170]:
def clean_text(text):
    pattern = r"((?<!\w)[^\s\w]|[^\s\w](?!\w))"

    cleaned_text = re.sub(pattern, r" \1 ", text)

    # Replace multiple spaces with a single space
    return re.sub(r"\s+", " ", cleaned_text).strip()

In [174]:
for sheet_name, sheet_df in df.items():
    sheet_df = sheet_df.map(lambda x: clean_text(x) if isinstance(x, str) else x)
    sheet_df.to_csv(f'clean_data\\{sheet_name}.csv', index=False)

2. Lemmatization

In [215]:
from nltk import word_tokenize, WordNetLemmatizer



[nltk_data] Downloading package stopwords to C:\Users\Yoni
[nltk_data]     Klein\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [209]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(text)  # Tokenize text
    return " ".join([lemmatizer.lemmatize(word) for word in words])

In [210]:
import os

for file in os.listdir('./clean_data'):
    print(file)
    sheet = pd.read_csv(f'clean_data\\{file}')
    sheet = sheet.map(lambda x: lemmatize_text(x) if isinstance(x, str) else x)
    sheet.to_csv(f"lemmatize_data\\{file}", index=False)



A-J.csv
BBC.csv
J-P.csv
NY-T.csv


In [227]:
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
from scipy.sparse import save_npz
# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))



In [244]:
def save_metadata(word_index, output_file):
    # save for feature use
    # Convert the dictionary to a DataFrame
    metadata_df = pd.DataFrame(list(word_index.items()), columns=["Word", "Index"])

    # Save the DataFrame to a CSV file
    metadata_df.to_csv(output_file, index=False)
    print(f"Metadata saved to {output_file}")


In [258]:
# bm25 to the lemmatized docs

for file in os.listdir('lemmatize_data'):
    print(file)
    sheet = pd.read_csv(f'lemmatize_data\\{file}')

    def remove_stopwords(text):
        words = text.split()  # Tokenize the text
        return [word.lower() for word in words if word.lower() not in stop_words]

    # Build the corpus
    if file == 'A-J.csv':
        corpus = [
            remove_stopwords(f'{row["title"]} {row["sub_title"]} {row["Body Text"]}')
            for _, row in sheet.iterrows()
        ]
    else:
        corpus = [
            remove_stopwords(f'{row["title"]} {row["Body Text"]}')
            for _, row in sheet.iterrows()
        ]

    bm25 = BM25Okapi(corpus)
    all_words = bm25.idf.keys()
    word_index = {word: idx for idx, word in enumerate(all_words)}
    save_metadata(word_index, f'metadata\\lemma\\{file}')
    rows, cols, data = [], [], []
    for word in all_words:
        scores = bm25.get_scores(word)
        for doc_idx, score in enumerate(scores):
            if score > 0:  # Only include non-zero scores to keep it sparse
                rows.append(doc_idx)  # Document index
                cols.append(word_index[word])  # Word index
                data.append(score)  # BM25 score


    sparse_bm25_matrix = csr_matrix((data, (rows, cols)), shape=(len(corpus), len(all_words)))

    save_npz(f'bm25\\lemma\\{file.split(".")[0]}', sparse_bm25_matrix)







A-J.csv
Metadata saved to metadata\lemma\A-J.csv
BBC.csv
Metadata saved to metadata\lemma\BBC.csv
J-P.csv
Metadata saved to metadata\lemma\J-P.csv
NY-T.csv
Metadata saved to metadata\lemma\NY-T.csv


In [259]:
for file in os.listdir('clean_data'):
    print(file)
    sheet = pd.read_csv(f'clean_data\\{file}')

    def remove_stopwords(text):
        words = text.split()  # Tokenize the text
        return [word.lower() for word in words if word.lower() not in stop_words]

    # Build the corpus
    if file == 'A-J.csv':
        corpus = [
            remove_stopwords(f'{row["title"]} {row["sub_title"]} {row["Body Text"]}') for _, row in sheet.iterrows()
        ]
    else:
        corpus = [
            remove_stopwords(f'{row["title"]} {row["Body Text"]}') for _, row in sheet.iterrows()
        ]

    bm25 = BM25Okapi(corpus)
    all_words = bm25.idf.keys()
    word_index = {word: idx for idx, word in enumerate(all_words)}
    save_metadata(word_index, f'metadata\\clean\\{file}')
    rows, cols, data = [], [], []
    for word in all_words:
        scores = bm25.get_scores(word)
        for doc_idx, score in enumerate(scores):
            if score > 0:  # Only include non-zero scores to keep it sparse
                rows.append(doc_idx)  # Document index
                cols.append(word_index[word])  # Word index
                data.append(score)  # BM25 score


    sparse_bm25_matrix = csr_matrix((data, (rows, cols)), shape=(len(corpus), len(all_words)))

    save_npz(f'bm25\\clean\\{file.split(".")[0]}', sparse_bm25_matrix)

A-J.csv
Metadata saved to metadata\clean\A-J.csv
BBC.csv
Metadata saved to metadata\clean\BBC.csv
J-P.csv
Metadata saved to metadata\clean\J-P.csv
NY-T.csv
Metadata saved to metadata\clean\NY-T.csv


3: IG

In [252]:
from sklearn.feature_selection import mutual_info_classif
from scipy.sparse import load_npz
import numpy as np

IG for lemmatized

In [261]:
for file in os.listdir('bm25\\lemma'):
    # Load the sparse BM25 matrix
    sparse_bm25_matrix = load_npz(f'bm25\\lemma\\{file}')

    # Sum the BM25 scores for each word across all documents (proxy for importance)
    word_scores = np.array(sparse_bm25_matrix.sum(axis=0)).flatten()  # Sum along columns

    # Load word-to-index mapping
    word_metadata = pd.read_csv(f"metadata\\lemma\\{file.split('.')[0]}.csv")

    # Map scores to words
    df_word_scores = pd.DataFrame({
        "Word": word_metadata["Word"],
        "Score": word_scores
    }).sort_values(by="Score", ascending=False)

    # Save to CSV
    df_word_scores.to_csv(f"IG\\lemma\\{file.split('.')[0]}.csv", index=False)

    # Display top words by score
    print(df_word_scores.head())

                      Word      Score
2632  youtu.be/ilery4m9kqc 1526.50330
1367                18.5bn 1180.27864
1981                   1.4 1171.21460
3876                   1.2 1170.98122
2788                   1.5 1170.64398
           Word      Score
9506     17.4.1 4089.23210
2594  bbc.co.uk 4082.05305
9128   2,332.97 3981.15565
2574   07.09.23 3872.42854
9126   2,431.29 3648.78138
                                               Word      Score
21151                            kronishl.e.a.r.h.n 9354.87922
21779  yanshufim.giveback.co.il/our_heart_helooking 6448.80153
7649           experience.www.noayekutieli.com/ella 6420.32105
16348                            15,000-50,000.some 6290.64304
16183         www.google-analytics.com/analytics.js 6177.02794
                Word      Score
5700  3,000-year-old 1031.01485
5353     2,000-pound  988.24087
3203    35,000people  983.35589
4051         570,000  972.24843
3192           4,700  965.58694


IG for clean

In [262]:
for file in os.listdir('bm25\\clean'):
    # Load the sparse BM25 matrix
    sparse_bm25_matrix = load_npz(f'bm25\\clean\\{file}')

    # Sum the BM25 scores for each word across all documents (proxy for importance)
    word_scores = np.array(sparse_bm25_matrix.sum(axis=0)).flatten()  # Sum along columns

    # Load word-to-index mapping
    word_metadata = pd.read_csv(f"metadata\\clean\\{file.split('.')[0]}.csv")

    # Map scores to words
    df_word_scores = pd.DataFrame({
        "Word": word_metadata["Word"],
        "Score": word_scores
    }).sort_values(by="Score", ascending=False)

    # Save to CSV
    df_word_scores.to_csv(f"IG\\clean\\{file.split('.')[0]}.csv", index=False)

    # Display top words by score
    print(df_word_scores.head())

                      Word      Score
2915  youtu.be/ilery4m9kqc 1186.85553
1475                18.5bn 1183.80731
2173                   1.4 1174.60631
4311                   1.2 1174.44120
3091                   1.5 1174.09042
                                    Word      Score
2824           hello.bbclondon@bbc.co.uk 4943.49398
16246     northwest.newsonline@bbc.co.uk 4906.26510
8490           yorkslincs.news@bbc.co.uk 4902.57845
4668          south.newsonline@bbc.co.uk 4901.53774
9640   newsonline.westmidlands@bbc.co.uk 4898.11889
                                                  Word       Score
19278  a.async=1;a.src=g;m.parentnode.insertbefore(a,m 13001.33335
19266                           function(i,s,o,g,r,a,m 12224.19119
25405                               kronishl.e.a.r.h.n  9288.65067
25605   activities&rdquo;emmanuel&nbsp;macron&ldquo;in  8650.33167
16591        official&nbsp;osama&nbsp;hamdan&nbsp;said  8573.77903
                       Word      Score
6394         3,000-