In [2]:
import pandas as pd
df = pd.read_excel('posts_first_targil.xlsx', sheet_name=None)

In [3]:
for sheet_name, sheet_df in df.items():
    print(f"Sheet name: {sheet_name}")
    print(sheet_df.columns)

Sheet name: A-J
Index(['sub_title', 'date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: BBC
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: J-P
Index(['date', 'Newspaper', 'Body', 'title'], dtype='object')
Sheet name: NY-T
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')


In [4]:
# we're renaming J-P column to match others
df['J-P'].rename(columns={'Body': 'Body Text'}, inplace=True)

In [5]:
for sheet_name, sheet_df in df.items():
    print(f"Sheet name: {sheet_name}")
    print(sheet_df.columns)

Sheet name: A-J
Index(['sub_title', 'date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: BBC
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: J-P
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')
Sheet name: NY-T
Index(['date', 'Newspaper', 'Body Text', 'title'], dtype='object')


In [6]:
import re

In [7]:
def clean_text(text):
    border_pattern = r"((?<!\w)[^\s\w]|[^\s\w](?!\w))"
    # Regex pattern
    dot_pattern = r"(?<!\w)([a-zA-Z]{2,})\.([a-zA-Z]{2,})(?!\w)"

    # Apply regex substitution
    cleaned_text = re.sub(dot_pattern, r"\1 . \2", text)

    cleaned_text = re.sub(border_pattern, r" \1 ", cleaned_text)

    # Replace multiple spaces with a single space
    return re.sub(r"\s+", " ", cleaned_text).strip()

In [8]:
for sheet_name, sheet_df in df.items():
    sheet_df = sheet_df.map(lambda x: clean_text(x) if isinstance(x, str) else x)
    sheet_df.to_csv(f'clean_data\\{sheet_name}.csv', index=False)

2. Lemmatization

In [9]:
from nltk import word_tokenize, WordNetLemmatizer

In [10]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = word_tokenize(text)  # Tokenize text
    return " ".join([lemmatizer.lemmatize(word) for word in words])

In [11]:
import os

for file in os.listdir('./clean_data'):
    print(file)
    sheet = pd.read_csv(f'clean_data\\{file}')
    sheet = sheet.map(lambda x: lemmatize_text(x) if isinstance(x, str) else x)
    sheet.to_csv(f"lemmatize_data\\{file}", index=False)



A-J.csv
BBC.csv
J-P.csv
NY-T.csv


BM25

In [12]:
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import nltk
from scipy.sparse import save_npz
# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

min_threshold = 5

In [13]:
def save_metadata(word_index, output_file):
    # save for feature use
    # Convert the dictionary to a DataFrame
    metadata_df = pd.DataFrame(list(word_index.items()), columns=["Word", "Index"])

    # Save the DataFrame to a CSV file
    metadata_df.to_csv(output_file, index=False)
    print(f"Metadata saved to {output_file}")


In [14]:
from collections import defaultdict

def get_word_freq(corpus):
    word_frequency = defaultdict(int)

    # Count occurrences
    for document in corpus:
        unique_words = set(document)  # Use a set to avoid counting duplicates in the same document
        for word in unique_words:
            word_frequency[word] += 1

    return  word_frequency

In [15]:
def remove_stopwords(text):
    words = text.split()  # Tokenize the text
    return [word.lower() for word in words if word.lower() not in stop_words]

bm25 to the lemmatized docs

In [16]:
for file in os.listdir('lemmatize_data'):
    print(file)
    sheet = pd.read_csv(f'lemmatize_data\\{file}')

    # Build the corpus
    if file == 'A-J.csv':
        corpus = [
            remove_stopwords(f'{row["title"]} {row["sub_title"]} {row["Body Text"]}')
            for _, row in sheet.iterrows()
        ]
    else:
        corpus = [
            remove_stopwords(f'{row["title"]} {row["Body Text"]}')
            for _, row in sheet.iterrows()
        ]

    bm25 = BM25Okapi(corpus)
    word_frequency = get_word_freq(corpus)
    all_words = [word for word in bm25.idf.keys() if word_frequency[word] > min_threshold]
    word_index = {word: idx for idx, word in enumerate(all_words)}
    save_metadata(word_index, f'metadata\\lemma\\{file}')

    rows, cols, data = [], [], []
    for word in all_words:
        scores = bm25.get_scores(word)
        for doc_idx, score in enumerate(scores):
            if score > 0:  # Only include non-zero scores to keep it sparse
                rows.append(doc_idx)  # Document index
                cols.append(word_index[word])  # Word index
                data.append(score)  # BM25 score

    sparse_bm25_matrix = csr_matrix((data, (rows, cols)), shape=(len(corpus), len(all_words)))

    save_npz(f'bm25\\lemma\\{file.split(".")[0]}', sparse_bm25_matrix)

A-J.csv
Metadata saved to metadata\lemma\A-J.csv
BBC.csv
Metadata saved to metadata\lemma\BBC.csv
J-P.csv
Metadata saved to metadata\lemma\J-P.csv
NY-T.csv
Metadata saved to metadata\lemma\NY-T.csv


bm25 to the clean docs

In [17]:
for file in os.listdir('clean_data'):
    print(file)
    sheet = pd.read_csv(f'clean_data\\{file}')

    # Build the corpus
    if file == 'A-J.csv':
        corpus = [
            remove_stopwords(f'{row["title"]} {row["sub_title"]} {row["Body Text"]}') for _, row in sheet.iterrows()
        ]
    else:
        corpus = [
            remove_stopwords(f'{row["title"]} {row["Body Text"]}') for _, row in sheet.iterrows()
        ]

    bm25 = BM25Okapi(corpus)
    word_frequency = get_word_freq(corpus)
    all_words = [word for word in bm25.idf.keys() if word_frequency[word] >= min_threshold]
    word_index = {word: idx for idx, word in enumerate(all_words)}
    save_metadata(word_index, f'metadata\\clean\\{file}')

    rows, cols, data = [], [], []
    for word in all_words:
        scores = bm25.get_scores(word)
        for doc_idx, score in enumerate(scores):
            if score > 0:  # Only include non-zero scores to keep it sparse
                rows.append(doc_idx)  # Document index
                cols.append(word_index[word])  # Word index
                data.append(score)  # BM25 score


    sparse_bm25_matrix = csr_matrix((data, (rows, cols)), shape=(len(corpus), len(all_words)))

    save_npz(f'bm25\\clean\\{file.split(".")[0]}', sparse_bm25_matrix)

A-J.csv
Metadata saved to metadata\clean\A-J.csv
BBC.csv
Metadata saved to metadata\clean\BBC.csv
J-P.csv
Metadata saved to metadata\clean\J-P.csv
NY-T.csv
Metadata saved to metadata\clean\NY-T.csv


3: IG

In [18]:
from scipy.sparse import load_npz
import numpy as np
from sklearn.feature_selection import mutual_info_classif


IG for lemmatized

In [19]:
for file in os.listdir('bm25\\lemma'):
    # Load the sparse BM25 matrix
    sparse_bm25_matrix = load_npz(f'bm25\\lemma\\{file}')

    # Sum the BM25 scores for each word across all documents (proxy for importance)
    word_scores = np.array(sparse_bm25_matrix.sum(axis=0)).flatten()  # Sum along columns

    # Load word-to-index mapping
    word_metadata = pd.read_csv(f"metadata\\lemma\\{file.split('.')[0]}.csv")

    # Map scores to words
    df_word_scores = pd.DataFrame({
        "Word": word_metadata["Word"],
        "Score": word_scores
    }).sort_values(by="Score", ascending=False)

    # Save to CSV
    df_word_scores.to_csv(f"IG\\lemma\\{file.split('.')[0]}.csv", index=False)

    # Display top words by score
    print(df_word_scores.head())

         Word        Score
14          .  1140.336991
13          ’   886.367915
3           ,   857.445048
295   country   173.383853
117  saturday   173.383853
              Word        Score
4319      built-up  2582.526583
3900     us-funded  2568.446341
2155  unsuccessful  2499.764703
3467       unusual  2473.820499
1927         co.uk  2419.119081
         Word        Score
4061    1,200  2125.162599
1205    1,500  2115.573944
3477  100,000  2097.791597
3510    1,400  2094.009603
3594   10,000  2074.861862
         Word       Score
398     1,200  904.589510
1076   35,000  899.313100
14          ,  838.990794
58          ’  649.898051
294   u.c.l.a  253.451491


IG for clean

In [20]:
for file in os.listdir('bm25\\clean'):
    # Load the sparse BM25 matrix
    sparse_bm25_matrix = load_npz(f'bm25\\clean\\{file}')

    # Sum the BM25 scores for each word across all documents (proxy for importance)
    word_scores = np.array(sparse_bm25_matrix.sum(axis=0)).flatten()  # Sum along columns

    # Load word-to-index mapping
    word_metadata = pd.read_csv(f"metadata\\clean\\{file.split('.')[0]}.csv")

    # Map scores to words
    df_word_scores = pd.DataFrame({
        "Word": word_metadata["Word"],
        "Score": word_scores
    }).sort_values(by="Score", ascending=False)

    # Save to CSV
    df_word_scores.to_csv(f"IG\\clean\\{file.split('.')[0]}.csv", index=False)

    # Display top words by score
    print(df_word_scores.head())

    Word        Score
13     .  1147.595128
3      ,   862.656498
104    :   163.927919
130    ?   158.371471
40     “   152.143006
         Word        Score
3676      1.7  2259.624247
3680    1,700  2255.698013
1965  576,000  2227.291317
5191      2.7  2169.278964
5476   37,000  2150.609325
                        Word        Score
1823       ldquo;we&rsquo;re  4182.916428
1219        ldquo;it&rsquo;s  4139.154511
1996         ldquo;i&rsquo;m  4139.154511
3752        ben-gvir&rsquo;s  3011.835327
682   ldquo;resistance&rdquo  2893.955323
                          Word       Score
274   storytelling,downloadthe  971.557259
234      storytelling,download  943.473888
419                      1,200  907.370295
1255                    35,000  902.047393
884                     30,000  886.152828


Word2Vec

In [21]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Load Google's pre-trained Word2Vec model (requires downloading)
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [22]:
def preprocess_text(text):
    # Remove punctuation and numbers
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)      # Remove digits
    # Tokenize the text
    return text

In [23]:
for file in os.listdir('lemmatize_data'):
    print(file)
    sheet = pd.read_csv(f'lemmatize_data\\{file}')

    if file == 'A-J.csv':
        corpus = [
            remove_stopwords(preprocess_text(f'{row["title"]} {row["sub_title"]} {row["Body Text"]}')) for _, row in sheet.iterrows()
        ]
    else:
        corpus = [
            remove_stopwords(preprocess_text(f'{row["title"]} {row["Body Text"]}')) for _, row in sheet.iterrows()
        ]

    docs_matrix = []
    for doc_idx, row in enumerate(corpus):
        matrix = []
        for word in row:
            if word in model:
                matrix.append(model[word].tolist())
            else:
                matrix.append([0] * model.vector_size)  # Placeholder for missing words

        docs_matrix.append(np.array(matrix).mean(axis=0))
    pd.DataFrame(docs_matrix).to_csv(f'Word2Vec\\lemma\\{file}', index=False)




A-J.csv
BBC.csv
J-P.csv
NY-T.csv


In [24]:
for file in os.listdir('clean_data'):
    print(file)
    sheet = pd.read_csv(f'clean_data\\{file}')

    if file == 'A-J.csv':
        corpus = [
            remove_stopwords(preprocess_text(f'{row["title"]} {row["sub_title"]} {row["Body Text"]}')) for _, row in sheet.iterrows()
        ]
    else:
        corpus = [
            remove_stopwords(preprocess_text(f'{row["title"]} {row["Body Text"]}')) for _, row in sheet.iterrows()
        ]
    docs_matrix = []
    for doc_idx, row in enumerate(corpus):
        matrix = []
        for word in row:
            if word in model:
                matrix.append(model[word].tolist())
            else:
                matrix.append([0] * model.vector_size)  # Placeholder for missing words

        docs_matrix.append(np.array(matrix).mean(axis=0))

    pd.DataFrame(docs_matrix).to_csv(f'Word2Vec\\clean\\{file}', index=False)



A-J.csv
BBC.csv
J-P.csv
NY-T.csv


Doc2Vec on original

In [25]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument


In [26]:
original = df.copy()
corpus = []
j = 0
for sheet_name, sheet_df in original.items():
    # Prepare the corpus as a list of TaggedDocument objects
    if sheet_name == 'A-J':
        corpus += [
            TaggedDocument(words=f'{row["title"]} {row["sub_title"]} {row["Body Text"]}'.split(), tags=[str(i+j*1000)])
            for i, row in sheet_df.iterrows()
        ]
    else:
        corpus += [
            TaggedDocument(words=f'{row["title"]} {row["Body Text"]}'.split(), tags=[str(i+j*1000)])
            for i, row in sheet_df.iterrows()
        ]
    j+=1

model = Doc2Vec(documents=corpus, vector_size=300, window=5, min_count=5, workers=4)

document_vectors = [model.dv[tag] for tag in model.dv.index_to_key]

# Save document vectors to a CSV file
document_vectors = pd.DataFrame(document_vectors)
document_vectors.iloc[0:599].to_csv("Doc2Vec\\A-J.csv", index=False)
document_vectors.iloc[599:1148].to_csv("Doc2Vec\\BBC.csv", index=False)
document_vectors.iloc[1148:1747].to_csv("Doc2Vec\\J-P.csv", index=False)
document_vectors.iloc[1747:2346].to_csv("Doc2Vec\\NY-T.csv", index=False)

BERT

In [27]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

for sheet_name, sheet_df in original.items():
    # Prepare the corpus as a list of TaggedDocument objects
    if sheet_name == 'A-J':
        corpus = [
            f'{row["title"]} {row["sub_title"]} {row["Body Text"]}'.lower() for _, row in sheet_df.iterrows()
        ]
    else:
        corpus = [
            f'{row["title"]} {row["Body Text"]}'.lower() for _, row in sheet_df.iterrows()
        ]

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)
    # model = model.to(device)
    # Tokenize and encode the corpus
    inputs = tokenizer(corpus, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Generate embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract token embeddings from the last hidden state
    last_hidden_state = outputs.last_hidden_state  # Shape: [batch_size, sequence_length, hidden_size]

    # Aggregate token embeddings to get sentence/document-level embeddings (e.g., mean pooling)
    corpus_embeddings = torch.mean(last_hidden_state, dim=1)  # Shape: [batch_size, hidden_size]

    # Convert to numpy array for easier manipulation
    corpus_embeddings = corpus_embeddings.numpy()

    print(corpus_embeddings.shape)  # (num_documents, hidden_size)

    pd.DataFrame(corpus_embeddings).to_csv(f"Bert\\{sheet_name}.csv", index=False)

(599, 768)
(549, 768)
(599, 768)
(599, 768)


In [31]:
from sentence_transformers import SentenceTransformer

# Load pre-trained Sentence-BERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

for sheet_name, sheet_df in original.items():
    # Prepare the corpus as a list of TaggedDocument objects
    if sheet_name == 'A-J':
        corpus = [
            f'{row["title"]} {row["sub_title"]} {row["Body Text"]}' for _, row in sheet_df.iterrows()
        ]
    else:
        corpus = [
            f'{row["title"]} {row["Body Text"]}' for _, row in sheet_df.iterrows()
        ]

    # Generate embeddings for the corpus
    corpus_embeddings = model.encode(corpus)

    # Check the shape of the embeddings
    print(corpus_embeddings.shape)  # (num_documents, embedding_size)

    pd.DataFrame(corpus_embeddings).to_csv(f"Bert-Sentence\\{sheet_name}.csv", index=False)

(599, 384)
(549, 384)
(599, 384)
(599, 384)
