In [45]:
import json
import pandas as pd
import re 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [46]:
import nltk 
# Fetch the NLTK resources used further down. Each call prints its own
# progress log and returns a bool, so the last call's result is what the
# cell displays.
# NOTE(review): presumably 'punkt' backs word_tokenize and 'wordnet' backs
# WordNetLemmatizer; recent NLTK releases may also need 'punkt_tab' for
# word_tokenize - confirm against the installed version.
nltk.download('punkt') 
nltk.download('wordnet') 
# The stopwords corpus is imported at the top, but no visible cell uses it.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [52]:
def load_json(filename):
    """Parse the JSON document stored in `filename`.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

In [53]:
# Input files, one per source. The news and Reddit files are iterated
# record by record later on; the MNT file is treated as a single record.
NEWS_ARTICLES_PATH = "combined_unique_articles.json"
MNT_ARTICLE_PATH = "medicalnewstoday_article.json"
REDDIT_POSTS_PATH = "reddit_posts.json"

news_articles = load_json(NEWS_ARTICLES_PATH)
mnt_article = load_json(MNT_ARTICLE_PATH)
reddit_posts = load_json(REDDIT_POSTS_PATH)

In [54]:
def extract_news_text(record):
    """Return the article text of one news record.

    We assume the 'content' field holds the article text.

    Args:
        record: Mapping for a single news article (anything with a
            dict-style ``get``).

    Returns:
        The value stored under "content", or "" when the key is missing or
        its value is falsy (for example a JSON ``null``, which loads as None).
    """
    # A plain ``get("content", "")`` only covers a missing key; an explicit
    # null would come back as None and break callers that call ``.strip()``.
    return record.get("content") or ""

In [55]:
def extract_mnt_text(record):
    """Return the article text of the MNT record.

    Args:
        record: Mapping for the article (anything with a dict-style ``get``).

    Returns:
        The value stored under "content", or "" when the key is missing or
        its value is falsy (for example a JSON ``null``, which loads as None).
    """
    # ``or ""`` also covers a key that is present but null, so callers can
    # call ``.strip()`` on the result unconditionally.
    return record.get("content") or ""

In [56]:
def extract_reddit_text(record):
    """Combine a Reddit post's title and body into one text.

    Args:
        record: Mapping for a single post; the "title" and "selftext" keys
            are read, and either may be missing or null.

    Returns:
        ``title + "\\n" + selftext`` when the post has a body, otherwise the
        title alone. Missing or null fields are treated as "".
    """
    # ``or ""`` normalises both a missing key and an explicit null, so the
    # concatenation below never sees None.
    title = record.get("title") or ""
    body = record.get("selftext") or ""
    # Sometimes selftext is empty; if so, we use the title only.
    return title + "\n" + body if body else title

In [57]:
def _keep_non_blank(texts):
    """Return the entries of `texts` that contain a non-whitespace character.

    Falsy entries (empty strings, None) are dropped without calling
    ``.strip()`` on them.
    """
    return [text for text in texts if text and text.strip()]


# Extract each record's text exactly once, then discard blank results.
# (Previously every extractor ran twice per record: once for the filter and
# once for the value.)
news_texts = _keep_non_blank(map(extract_news_text, news_articles))
# The MNT file holds a single record, so this list has zero or one entry.
mnt_texts = _keep_non_blank([extract_mnt_text(mnt_article)])
reddit_texts = _keep_non_blank(map(extract_reddit_text, reddit_posts))

In [58]:
# Stack the three sources in a fixed order (news, MNT, Reddit) into a
# single-column frame, one row per document.
all_texts = [*news_texts, *mnt_texts, *reddit_texts]
df = pd.DataFrame(data={"text": all_texts})
print("Total records loaded:", df.shape[0])

Total records loaded: 793


In [60]:
# Show the combined frame; at this point it holds only the raw "text" column.
df

Unnamed: 0,text
0,Bunge Global (NYSE:BG – Get Free Report) and A...
1,GMO U.S. Quality ETF (NYSEARCA:QLTY – Get Free...
2,"LAKELAND, Fla., Feb. 14, 2025 (GLOBE NEWSWIRE)..."
3,We independently select these products—if you ...
4,If youve been on the hunt for a way to boost y...
...,...
788,GMOs are key to reducing pesticide use in farm...
789,Genetically modified crops: A success story in...
790,GMOs: Improving nutrition and food quality for...
791,Why I support GMOs: A farmer's perspective.\nW...


In [61]:
def basic_clean(text):
    """Lowercase `text` and keep only the letters a-z separated by single spaces.

    Every character that is not a lowercase ASCII letter or whitespace
    (digits, punctuation, apostrophes, accented letters, ...) is replaced by
    a space, so "farmer's" becomes "farmer s". Whitespace runs are then
    collapsed to one space and the ends are trimmed.

    Args:
        text: The raw string to normalise.

    Returns:
        The cleaned string; "" if nothing but non-letters was passed in.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()

# Normalise every document; the cleaned copy is stored next to the raw text.
df["clean_text"] = df["text"].map(basic_clean)

In [65]:
# Keep only the cleaned text from here on. errors='ignore' makes this cell
# safe to re-run: without it, a second execution raises KeyError because the
# 'text' column is already gone.
df = df.drop(columns=['text'], errors='ignore')

In [66]:
# Show the frame after the raw "text" column was dropped (only "clean_text" remains).
df

Unnamed: 0,clean_text
0,bunge global nyse bg get free report and austr...
1,gmo u s quality etf nysearca qlty get free rep...
2,lakeland fla feb globe newswire a new turmeric...
3,we independently select these products if you ...
4,if youve been on the hunt for a way to boost y...
...,...
788,gmos are key to reducing pesticide use in farm...
789,genetically modified crops a success story in ...
790,gmos improving nutrition and food quality for ...
791,why i support gmos a farmer s perspective with...


In [69]:
analyzer = SentimentIntensityAnalyzer()

def assign_label(text):
    """Map the VADER compound sentiment score of `text` to one of four stance labels.

    A positive score is taken to indicate a pro-GMO stance and a negative
    score an anti-GMO stance. Thresholds on the compound score:

        score >= 0.6            -> "Strongly Pro-GMO"
        0.01 <= score < 0.6     -> "Mildly Pro-GMO"
        -0.4 <= score < 0.01    -> "Mildly Anti-GMO"
        anything else           -> "Strongly Anti-GMO"

    NOTE(review): the cut-offs are not symmetric (0.6 vs -0.4), and a
    compound score of exactly 0 falls into "Mildly Anti-GMO" - confirm both
    are intended.

    Args:
        text: The document to score.

    Returns:
        One of the four label strings listed above.
    """
    compound = analyzer.polarity_scores(text)["compound"]
    # Thresholds are checked from the highest down, so each test only needs
    # its lower bound.
    if compound >= 0.6:
        return "Strongly Pro-GMO"
    if compound >= 0.01:
        return "Mildly Pro-GMO"
    if compound >= -0.4:
        return "Mildly Anti-GMO"
    return "Strongly Anti-GMO"

# Label every document from its cleaned text, then report the class balance.
df["label"] = df["clean_text"].map(assign_label)
label_counts = df["label"].value_counts()
print("Label distribution:")
print(label_counts)

Label distribution:
label
Mildly Anti-GMO      211
Mildly Pro-GMO       209
Strongly Pro-GMO     194
Strongly Anti-GMO    179
Name: count, dtype: int64


In [73]:
# Show the labelled frame ("clean_text" plus the "label" column).
df

Unnamed: 0,clean_text,label
0,bunge global nyse bg get free report and austr...,Strongly Pro-GMO
1,gmo u s quality etf nysearca qlty get free rep...,Strongly Pro-GMO
2,lakeland fla feb globe newswire a new turmeric...,Strongly Anti-GMO
3,we independently select these products if you ...,Mildly Anti-GMO
4,if youve been on the hunt for a way to boost y...,Strongly Pro-GMO
...,...,...
788,gmos are key to reducing pesticide use in farm...,Mildly Pro-GMO
789,genetically modified crops a success story in ...,Strongly Pro-GMO
790,gmos improving nutrition and food quality for ...,Strongly Pro-GMO
791,why i support gmos a farmer s perspective with...,Strongly Pro-GMO


In [71]:
# ---------------------------------------------
# 1. Create DataFrame with Stemming
# ---------------------------------------------
ps = PorterStemmer()

def stem_text(text):
    """Tokenize `text` and return its Porter-stemmed tokens joined by single spaces."""
    return " ".join(ps.stem(token) for token in word_tokenize(text))

# Work on a copy so `df` itself does not gain the extra column.
df_stem = df.copy()
df_stem['stemmed'] = df_stem['clean_text'].map(stem_text)

# Put "label" first and keep every other column in its current order.
stem_columns = df_stem.columns.tolist()
stem_columns.remove("label")
df_stem = df_stem[["label", *stem_columns]]

# Optionally, save to CSV (the index is not written).
df_stem.to_csv("df_stem.csv", index=False)


# ---------------------------------------------
# 2. Create DataFrame with Lemmatization
# ---------------------------------------------
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize `text` and return its lemmatized tokens joined by single spaces.

    NOTE(review): no `pos` argument is passed to `lemmatize`, so NLTK's
    default part of speech applies (documented as noun - confirm); verbs and
    adjectives are then presumably left as they are.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

# Work on a copy so `df` itself does not gain the extra column.
df_lemmatize = df.copy()
df_lemmatize['lemmatized'] = df_lemmatize['clean_text'].map(lemmatize_text)

# Put "label" first and keep every other column in its current order.
lemma_columns = df_lemmatize.columns.tolist()
lemma_columns.remove("label")
df_lemmatize = df_lemmatize[["label", *lemma_columns]]

df_lemmatize.to_csv("df_lemmatize.csv", index=False)


def _labelled_feature_frame(matrix, feature_names, labels, label_column="label"):
    """Build a dense feature DataFrame whose first column holds the labels.

    Args:
        matrix: Sparse document-term matrix (anything with ``toarray()``).
        feature_names: One column name per matrix column.
        labels: One label per matrix row.
        label_column: Name of the label column placed at position 0.

    Returns:
        A DataFrame with `label_column` first, followed by the features in
        their original order.
    """
    frame = pd.DataFrame(matrix.toarray(), columns=feature_names)
    if label_column in frame.columns:
        # The vocabulary itself contains the token "label" (plausible in a
        # corpus about GMO labelling). Assigning the class labels under the
        # same name would silently overwrite that feature, so rename it.
        # The cleaned text contains no underscores, so the new name cannot
        # clash with another token.
        frame = frame.rename(columns={label_column: f"{label_column}_token"})
    # insert() puts the labels at position 0 directly and raises if the name
    # is still taken, instead of overwriting.
    frame.insert(0, label_column, labels)
    return frame


# ---------------------------------------------
# 3. Create DataFrame using CountVectorizer
# ---------------------------------------------
# Parameters: max_df=0.9, min_df=2, max_features=1000 (adjust as needed)
count_vect = CountVectorizer(max_df=0.9, min_df=2, max_features=1000)
X_count = count_vect.fit_transform(df['clean_text'])

# Labels come from the original df and are placed in the first column.
df_count = _labelled_feature_frame(
    X_count, count_vect.get_feature_names_out(), df['label'].values
)

df_count.to_csv("df_count.csv", index=False)


# ---------------------------------------------
# 4. Create DataFrame using TfidfVectorizer
# ---------------------------------------------
tfidf_vect = TfidfVectorizer(max_df=0.9, min_df=2, max_features=1000)
X_tfidf = tfidf_vect.fit_transform(df['clean_text'])

df_tfidf = _labelled_feature_frame(
    X_tfidf, tfidf_vect.get_feature_names_out(), df['label'].values
)

df_tfidf.to_csv("df_tfidf.csv", index=False)

print("DataFrames created and saved as CSV files.")

DataFrames created and saved as CSV files.


In [76]:
# Print each derived frame, with a separator line between consecutive frames.
# The stemmed / lemmatized previews hide the redundant 'clean_text' column.
# This is done on a throw-away copy at display time instead of by mutating
# df_stem / df_lemmatize, so the cell gives the same output on every run and
# does not depend on a drop executed earlier in the session.
SEPARATOR = "\n---------------------------------\n"

previews = [
    ("Stemmed DataFrame:", df_stem.drop(columns=['clean_text'], errors='ignore')),
    ("Lemmatized DataFrame:", df_lemmatize.drop(columns=['clean_text'], errors='ignore')),
    ("CountVectorizer DataFrame:", df_count),
    ("TfidfVectorizer DataFrame:", df_tfidf),
]

for position, (title, frame) in enumerate(previews):
    if position:
        # Separator goes between frames only, not after the last one.
        print(SEPARATOR)
    print(title)
    print(frame)


Stemmed DataFrame:
                 label                                            stemmed
0     Strongly Pro-GMO  bung global nyse bg get free report and austra...
1     Strongly Pro-GMO  gmo u s qualiti etf nysearca qlti get free rep...
2    Strongly Anti-GMO  lakeland fla feb globe newswir a new turmer ha...
3      Mildly Anti-GMO  we independ select these product if you buy fr...
4     Strongly Pro-GMO  if youv been on the hunt for a way to boost yo...
..                 ...                                                ...
788     Mildly Pro-GMO  gmo are key to reduc pesticid use in farm i m ...
789   Strongly Pro-GMO  genet modifi crop a success stori in agricultu...
790   Strongly Pro-GMO  gmo improv nutrit and food qualiti for everyon...
791   Strongly Pro-GMO  whi i support gmo a farmer s perspect with car...
792     Mildly Pro-GMO  gmo feed the world and fight malnutrit farmer ...

[793 rows x 2 columns]

---------------------------------

Lemmatized DataFrame:
           