## Preprocessing

In [30]:
import pandas as pd

# Read the articles CSV file into a DataFrame.
# NOTE(review): a later cell writes `df` back to this same path, so the columns
# loaded here depend on whether that cell has already been run.
df = pd.read_csv('nytimes_articles.csv')

# Print the first rows of the DataFrame as plain text.
print(df.head())

                                            headline  \
0        Power by Proxy: How Iran Shapes the Mideast   
1  Arrests Expose Rift Between N.Y.P.D. and ‘Viol...   
2  Pardoned for Serving in Ukraine, They Return t...   
3  Berlin Was a Beacon of Artistic Freedom. Gaza ...   
4        The New York Times News Quiz, April 5, 2024   

                                             article  \
0  Advertisement    and     TURKEY Militias in Sy...   
1  Advertisement Supported by An outreach worker ...   
2  Advertisement Supported by Recruiting convicts...   
3  Critic’s Notebook The home of boundary-pushing...   
4  Advertisement Did you follow the news this wee...   

                                                text  \
0  Power by Proxy: How Iran Shapes the Mideast Ad...   
1  Arrests Expose Rift Between N.Y.P.D. and ‘Viol...   
2  Pardoned for Serving in Ukraine, They Return t...   
3  Berlin Was a Beacon of Artistic Freedom. Gaza ...   
4  The New York Times News Quiz, April 5, 2024

In [32]:
# (rows, columns) of the loaded frame.
df.shape

(47, 4)

In [31]:
# Count how often each headline occurs; a count above 1 would reveal duplicates.
df["headline"].value_counts()

headline
Power by Proxy: How Iran Shapes the Mideast                                                                1
Can the Left Be Happy?                                                                                     1
Regulating Opioid Prescriptions                                                                            1
The Birth Dearth and the Smartphone Age                                                                    1
Three Democratic Senators Are Stuck Indulging an Outdated Fantasy                                          1
Joe Lieberman and David Mixner, Exemplary Outsiders                                                        1
Are Smartphones Just a Scapegoat for Our Unhappy Children?                                                 1
Why Richard Serra’s                                                                                        1
My Story Was Told in ‘Hotel Rwanda.’ Here’s What I Want the World to Know Now.                             1
Who Is Jor

In [2]:
def _text_or_empty(value):
    """Return ``value`` unchanged, except that a missing cell (None/NaN) becomes ''."""
    # NaN is the only float that does not compare equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value


def concatenate(row):
    """Join a row's headline and article into one space-separated string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"headline"`` and ``"article"``.

    Returns
    -------
    str
        ``headline + " " + article``. A missing headline or article
        (``None`` or a float NaN, which is how ``read_csv`` represents an
        empty cell) contributes an empty string instead of raising
        ``TypeError`` on ``str + float``.
    """
    headline = _text_or_empty(row["headline"])
    article = _text_or_empty(row["article"])
    return headline + " " + article

In [3]:
# Build the combined "headline article" string row by row and store it as "text".
df["text"] = df.apply(concatenate, axis="columns")
df.head(2)

Unnamed: 0,headline,article,text,processed_text
0,Power by Proxy: How Iran Shapes the Mideast,Advertisement and TURKEY Militias in Sy...,Power by Proxy: How Iran Shapes the Mideast Ad...,"['power', 'proxi', ':', 'iran', 'shape', 'mide..."
1,Arrests Expose Rift Between N.Y.P.D. and ‘Viol...,Advertisement Supported by An outreach worker ...,Arrests Expose Rift Between N.Y.P.D. and ‘Viol...,"['arrest', 'expos', 'rift', 'n.y.p.d', '.', '‘..."


In [4]:
# Inspect the full combined text of the row labelled 0.
df.loc[0, "text"]

'Power by Proxy: How Iran Shapes the Mideast Advertisement    and     TURKEY Militias in Syria and Iraq SYRIA LEBANON Iran Hezbollah ISRAEL IRAQ Hamas Egypt Saudi Arabia OMAN The Houthis Sudan YEMEN 500 miles TURKEY Militias in Syria and Iraq LEBANON SYRIA Iran Hezbollah IRAQ ISRAEL Hamas Egypt Saudi Arabia The Houthis Sudan YEMEN 500 miles The New York Times For years, Iran has been the outsider. Predominantly Persian-speaking in a region where most people speak Arabic, overwhelmingly Shiite where most are Sunni, it has been crippled by Western sanctions meant to make it a pariah. Yet Iran has succeeded in projecting its military power across a large swath of the Middle East. Its reach equals — if not eclipses — that of traditional power centers like Egypt and Saudi Arabia. And now, spurred by the war in the Gaza Strip, armed groups that Iran has fostered over the past 45 years have mobilized simultaneously toward similar goals: diminishing Israeli power and confronting its closest al

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used below: the 'punkt' tokenizer data and the
# 'stopwords' corpus.
# NOTE(review): newer NLTK releases may require 'punkt_tab' for word_tokenize —
# confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Shaik
[nltk_data]     Ifran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Shaik
[nltk_data]     Ifran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def preprocess_text(text):
    """Lowercase, tokenize, drop English stop words, and Porter-stem ``text``.

    Only stop words are filtered out; punctuation tokens produced by the
    tokenizer are kept and passed through the stemmer like any other token.

    Returns a list of stemmed token strings.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Filter and stem in a single pass over the lowercase tokens.
    return [
        porter.stem(token)
        for token in lowered_tokens
        if token not in english_stops
    ]

In [7]:
# Tokenize, filter, and stem the combined 'text' column into per-row token lists.
df['processed_text'] = df['text'].map(preprocess_text)
df.head(2)

Unnamed: 0,headline,article,text,processed_text
0,Power by Proxy: How Iran Shapes the Mideast,Advertisement and TURKEY Militias in Sy...,Power by Proxy: How Iran Shapes the Mideast Ad...,"[power, proxi, :, iran, shape, mideast, advert..."
1,Arrests Expose Rift Between N.Y.P.D. and ‘Viol...,Advertisement Supported by An outreach worker ...,Arrests Expose Rift Between N.Y.P.D. and ‘Viol...,"[arrest, expos, rift, n.y.p.d, ., ‘, violenc, ..."


## TF-IDF Vectorization

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# The vectorizer takes one string per document, so each token list is re-joined
# with single spaces before fitting and transforming.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'].map(' '.join))

# Dense copy of the TF-IDF matrix, one column per vocabulary term, for inspection.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Preview the first rows of the TF-IDF table.
tfidf_df.head()

Unnamed: 0,000,05,07,10,100,102,108,10th,11,11th,...,zawada,zelenski,zero,zionism,zionist,zip,zohar,zone,zour,zuckerberg
0,0.056524,0.0,0.0,0.005139,0.033564,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023506,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.056993,0.0,0.0,0.022797,0.0,0.0,0.0,0.0,0.032056,0.0,...,0.0,0.021746,0.021746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.008714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017997,0.019931,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.031637,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Vocabulary terms learned by the vectorizer; these are the columns of tfidf_df.
tfidf_vectorizer.get_feature_names_out()

array(['000', '05', '07', ..., 'zone', 'zour', 'zuckerberg'], dtype=object)

In [10]:
# Sanity check: both frames should have the same number of rows.
tfidf_df.shape,df.shape

((47, 7196), (47, 4))

In [11]:
import pickle

# Save DataFrames to CSV files
# NOTE(review): this overwrites the same 'nytimes_articles.csv' that the first
# cell reads, so the input file now also carries the derived 'text' and
# 'processed_text' columns. CSV stores each token list as its string repr, so
# 'processed_text' comes back as strings (not lists) on reload — consider
# writing to a separate output file.
df.to_csv('nytimes_articles.csv', index=False)
tfidf_df.to_csv('tfidf_matrix.csv', index=False)

# Save the fitted TF-IDF vectorizer to a pickle file
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)


## Word2Vec Vectorization

In [12]:
import gensim.downloader as api
import numpy as np
import pandas as pd

# Load the pre-trained "word2vec-google-news-300" vectors through the gensim downloader.
model = api.load("word2vec-google-news-300")

# Generate Word2Vec embeddings for each word
def generate_word_embeddings(words):
    """Look up the vector of every token in ``words`` in the global ``model``.

    Tokens for which the lookup raises ``KeyError`` are skipped silently, so
    the returned list can be shorter than ``words`` (or empty).

    NOTE(review): the caller passes stemmed tokens; stems may be absent from
    the pre-trained vocabulary — verify how many tokens actually survive.
    """
    vectors = []
    for token in words:
        try:
            vector = model[token]
        except KeyError:
            # Token is not in the model's vocabulary: leave it out.
            continue
        vectors.append(vector)
    return vectors

def calculate_average_embedding(embeddings):
    """Average a list of word vectors element-wise into a single vector.

    Returns the mean over axis 0 of ``embeddings``. When ``embeddings`` is
    empty, returns an all-zero vector of length ``model.vector_size``.
    """
    if embeddings:
        return np.mean(embeddings, axis=0)
    # Nothing to average: fall back to a zero vector of the model's dimension.
    return np.zeros(model.vector_size)

# NOTE(review): `embeddings_df` is created empty and is never filled in this cell.
embeddings_df = pd.DataFrame()

# Look up the word vectors for each row's token list and store them in a new column of `df`
df['word_embeddings'] = df['processed_text'].apply(generate_word_embeddings)

embeddings_vec = df['word_embeddings'].apply(calculate_average_embedding)
# `embeddings_vec` now holds one averaged vector per row; `embeddings_df` is still empty.

In [13]:
# Check the container type of the averaged embeddings.
type(embeddings_vec)

pandas.core.series.Series

In [14]:
# Shape of the averaged vector stored under label 1.
embeddings_vec[1].shape

(300,)

In [15]:
import pickle

# Serialize the Series of averaged vectors to disk with pickle.
with open('embeddings_vec.pkl', 'wb') as f:
    pickle.dump(embeddings_vec, f)