# Exploring the Relationship Between Connotation of News Articles and Company Revenue Using Sentiment Analysis

## All Imports

In [2]:
#Standard imports
import pandas as pd
import numpy as np
import random
import plotly.express as px

In [3]:
#Natural Language Toolkit (NLTK) import and pre-trained model and other resources download
import nltk
#Download only the NLTK resources this notebook uses instead of the entire 'all' collection:
#VADER lexicon (SentimentIntensityAnalyzer), stop-word lists, Punkt tokenizer models (word_tokenize)
#and WordNet (WordNetLemmatizer). 'punkt_tab' and 'omw-1.4' are requested as well because some
#NLTK releases look for them; an id unknown to the installed release is reported as a failed
#download, not raised as an exception.
for resource in ("vader_lexicon", "stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
  nltk.download(resource, quiet=True)

#Other NLTK imports
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#ML imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

## Data Cleaning

In [None]:
p = 0.01  # Take 1% of the population to generate a sample
random.seed(42)  # Seed the sampler so the same rows are drawn on every run
#Original and raw dataset is from https://www.kaggle.com/datasets/aryansingh0909/nyt-articles-21m-2000-present
# skiprows keeps the header (i == 0) and then keeps each data row with probability p
nyt = pd.read_csv(r"content/nyt-metadata.csv", header=0, skiprows=lambda i: i>0 and random.random() > p, low_memory=False)
nyt = nyt.drop(columns=(['web_url', 'uri', '_id', 'byline', 'subsection_name', 'document_type','multimedia', 'source', 'snippet', 'keywords']))

# Clean headline column and create new column
# The title is taken as the text between the "'main': " and ", 'kicker': " markers of each raw value,
# minus one character on each side (the quote characters surrounding the title).
start = "'main': "
end = ", 'kicker': "
headline = []  # Plain list: appending is O(1), unlike np.append which copies the whole array each time
for value in nyt["headline"]:
    # Missing (non-string) values and values without both markers become NaN instead of raising
    idx1 = value.find(start) if isinstance(value, str) else -1
    idx2 = value.find(end, idx1) if idx1 != -1 else -1
    if idx2 == -1:
        headline.append(np.nan)
        continue
    headline.append(value[idx1 + len(start) + 1: idx2 - 1])

# Create new column
nyt['headlines'] = headline

# Drop old headline column
nyt = nyt.drop(columns=(['headline']))

# Rearrange column order (personal preference)
new_cols = ['headlines', 'abstract', 'lead_paragraph', 'print_section', 'print_page', 'pub_date', 'news_desk', 'section_name', 'type_of_material', 'word_count']
nyt=nyt.reindex(columns=new_cols)

# Export new dataset
nyt.to_csv('nyt-metadata-SAMPLE.csv', index=False)

In [None]:
#Load the sample workbook and keep only the first ten columns (trailing columns are blank)
#NOTE(review): the previous cell exports 'nyt-metadata-SAMPLE.csv' while this cell reads an .xlsx
#of the same name - presumably the file was converted by hand; confirm.
nyt = pd.read_excel(r"/content/nyt-metadata-SAMPLE.xlsx").iloc[:, :10]
#15s runtime

#Make sure every headline is a str before text processing
nyt = nyt.assign(headlines=nyt['headlines'].astype(str))

nyt.head()

Unnamed: 0,headlines,abstract,lead_paragraph,print_section,print_page,pub_date,news_desk,section_name,type_of_material,word_count
0,"On This First Day, a Fanfare for the New Era; ...","Anne Lord Witt letter, replying to Joyce Carol...",To the Editor:,A,30,2000-01-01 05:00:00+00:00,Editorial Desk,Opinion,Letter,129
1,Manifestoes To Give City A New Edge,Range of influential experts offer ideas on ki...,For anyone who thinks New York City institutio...,E,28,2000-01-01 05:00:00+00:00,The Millennium,Archives,News,134
2,"Paid Notice: Deaths CLOSE, MARY ''MOLLY'' G.","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...",1,35,2000-01-02 05:00:00+00:00,Classified,Archives,Paid Death Notice,128
3,Dec. 26 - Jan. 1; China Sentences 4 Members Of...,Sending a harsh message to followers of the Fa...,Sending a harsh message to followers of the Fa...,4,2,2000-01-02 05:00:00+00:00,Week in Review Desk,Week in Review,News,105
4,What Do the Stars Say? Boot Up and Find Out,Fortune tellers in New York's ethnic communiti...,"THIS time of year, predictions are as plentifu...",14,3,2000-01-02 05:00:00+00:00,The City Weekly Desk,New York,News,285


In [None]:
#Text preprocessing function
_ENGLISH_STOP_WORDS = None  #Filled on first call with the English stop-word set
_LEMMATIZER = None  #Filled on first call with a shared WordNetLemmatizer

def textPreprocess(string):
  """Lower-case and tokenize `string`, drop English stop words, lemmatize, and re-join with spaces.

  The stop-word list is read once and kept as a set, and a single lemmatizer is reused,
  instead of re-reading the list for every token and building a new lemmatizer on every call.

  Args:
    string: the text to process (must support `.lower()`).

  Returns:
    The processed tokens joined by single spaces.
  """
  global _ENGLISH_STOP_WORDS, _LEMMATIZER
  if _ENGLISH_STOP_WORDS is None:
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
  if _LEMMATIZER is None:
    _LEMMATIZER = WordNetLemmatizer()

  #Tokenizing text
  tokens = word_tokenize(string.lower())

  #Removing stop words (set membership, so each lookup is constant time)
  filteredTokens = [token for token in tokens if token not in _ENGLISH_STOP_WORDS]

  #Lemmatizing tokens
  lemmatizedTokens = [_LEMMATIZER.lemmatize(token) for token in filteredTokens]

  #Joining tokens back together
  return ' '.join(lemmatizedTokens)

In [None]:
#One shared VADER analyzer, created once and reused for every headline
analyzer = SentimentIntensityAnalyzer()

#Sentiment fetching function
def sentimentFetch(string):
  """Return the "compound" entry of the analyzer's polarity scores for `string`."""
  return analyzer.polarity_scores(string)["compound"]

In [None]:
#Preprocess each headline, score it, and store the compound score in a new column
nyt["compound_sentiment"] = nyt["headlines"].apply(lambda text: sentimentFetch(textPreprocess(text)))
#26s runtime

In [None]:
#Positive, neutral, or negative sentiment definer function
def compoundToDirection(score):
  """Map a compound score to 1 (score >= 0.05), -1 (score <= -0.05), or 0 (anything else)."""
  if score >= 0.05:
    return 1
  return -1 if score <= -0.05 else 0

In [None]:
#Turn each compound score into a direction (1, 0 or -1) and store it in a new column
nyt["sentiment_direction"] = nyt["compound_sentiment"].map(compoundToDirection)

In [None]:
#Take the first four characters of pub_date (the publication year) and keep them as strings
nyt["pub_year"] = nyt["pub_date"].str.slice(stop=4).astype(str)

#Write the frame, now including the sentiment columns, to disk
nyt.to_csv("nyt-sentiment.csv", index=False)

nyt.head()

Unnamed: 0,headlines,abstract,lead_paragraph,print_section,print_page,pub_date,news_desk,section_name,type_of_material,word_count,compound_sentiment,sentiment_direction,pub_year
0,"On This First Day, a Fanfare for the New Era; ...","Anne Lord Witt letter, replying to Joyce Carol...",To the Editor:,A,30,2000-01-01 05:00:00+00:00,Editorial Desk,Opinion,Letter,129,0.4404,1,2000
1,Manifestoes To Give City A New Edge,Range of influential experts offer ideas on ki...,For anyone who thinks New York City institutio...,E,28,2000-01-01 05:00:00+00:00,The Millennium,Archives,News,134,0.0,0,2000
2,"Paid Notice: Deaths CLOSE, MARY ''MOLLY'' G.","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...",1,35,2000-01-02 05:00:00+00:00,Classified,Archives,Paid Death Notice,128,-0.5994,-1,2000
3,Dec. 26 - Jan. 1; China Sentences 4 Members Of...,Sending a harsh message to followers of the Fa...,Sending a harsh message to followers of the Fa...,4,2,2000-01-02 05:00:00+00:00,Week in Review Desk,Week in Review,News,105,0.0772,1,2000
4,What Do the Stars Say? Boot Up and Find Out,Fortune tellers in New York's ethnic communiti...,"THIS time of year, predictions are as plentifu...",14,3,2000-01-02 05:00:00+00:00,The City Weekly Desk,New York,News,285,0.0,0,2000


### Sentiment Dataset Cleaning

In [19]:
#Read in sentiment data; pub_year is kept as text so it can be parsed as a calendar year below
nyt_s = pd.read_csv("/content/nyt-sentiment.csv", dtype={"pub_year": str})

#Convert pub_year column to pandas timestamp (January 1st of that year).
#The explicit format matters: if the column were inferred as numeric, to_datetime would read
#the values as nanoseconds since 1970 instead of as years.
nyt_s['pub_year'] = pd.to_datetime(nyt_s['pub_year'], format="%Y", errors='coerce')
#Drop any NaT values resulting from coerced conversion
nyt_s = nyt_s.dropna(axis=0,subset=['pub_year'])

#Find absolute value of compound_sentiment so that it correlates to strength (directionality is recorded in sentiment_direction)
nyt_s["compound_sentiment"] = nyt_s["compound_sentiment"].abs()

nyt_s.head()

Unnamed: 0,headlines,abstract,lead_paragraph,print_section,print_page,pub_date,news_desk,section_name,type_of_material,word_count,compound_sentiment,sentiment_direction,pub_year
0,"On This First Day, a Fanfare for the New Era; ...","Anne Lord Witt letter, replying to Joyce Carol...",To the Editor:,A,30,2000-01-01 05:00:00+00:00,Editorial Desk,Opinion,Letter,129,0.4404,1,2000-01-01
1,Manifestoes To Give City A New Edge,Range of influential experts offer ideas on ki...,For anyone who thinks New York City institutio...,E,28,2000-01-01 05:00:00+00:00,The Millennium,Archives,News,134,0.0,0,2000-01-01
2,"Paid Notice: Deaths CLOSE, MARY ''MOLLY'' G.","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...",1,35,2000-01-02 05:00:00+00:00,Classified,Archives,Paid Death Notice,128,0.5994,-1,2000-01-01
3,Dec. 26 - Jan. 1; China Sentences 4 Members Of...,Sending a harsh message to followers of the Fa...,Sending a harsh message to followers of the Fa...,4,2,2000-01-02 05:00:00+00:00,Week in Review Desk,Week in Review,News,105,0.0772,1,2000-01-01
4,What Do the Stars Say? Boot Up and Find Out,Fortune tellers in New York's ethnic communiti...,"THIS time of year, predictions are as plentifu...",14,3,2000-01-02 05:00:00+00:00,The City Weekly Desk,New York,News,285,0.0,0,2000-01-01


### Revenue Dataset Cleaning

In [20]:
#Load the revenue workbook
nyt_r = pd.read_excel("/content/New York Times Revenue.xlsx")
#Replace the datetime-valued Year column with just its calendar year
nyt_r = nyt_r.assign(Year=nyt_r["Year"].dt.year)

nyt_r.head()

Unnamed: 0,Year,Revenue (in Billions),Change (in Millions),Growth,CPI,Revenue_Adjusted (in 2022 dollars)
0,2022,2.31,233.44,0.1125,292.7,2.31
1,2021,2.07,291.24,0.1633,271.0,2.235753
2,2020,1.78,-28.55,-0.0158,258.8,2.013161
3,2019,1.81,63.59,0.0364,255.7,2.071908
4,2018,1.75,72.96,0.0435,251.1,2.039924


## ML Dataset Creation

In [21]:
#Function for standardizing independent variables
def standard_units(arr):
    """Return `arr` with its mean subtracted, divided by its standard deviation (as computed by np.std)."""
    center = np.mean(arr)
    spread = np.std(arr)
    return (arr - center) / spread

In [31]:
#Group by publication year and average compound sentiment and sentiment direction in a single pass
#(the result is indexed by year, with the index named "pub_year")
nyt_ml = nyt_s.groupby(nyt_s["pub_year"].dt.year)[["compound_sentiment", "sentiment_direction"]].mean()

print(nyt_ml.head())
#Standardize the yearly averages once and reuse the result below
nyt_ml_std = nyt_ml.apply(standard_units)
#Record 2023 compound_sentiment and sentiment_direction; selected by year label, not row position,
#so a missing year raises a KeyError instead of silently returning a different year's row
#NOTE(review): these values are standardized over every year present here, whereas the model inputs
#are standardized later, after the inner merge with the revenue table (which may cover a different
#set of years) - confirm the two scalings are meant to differ.
sentiment2023 = nyt_ml_std.loc[[2023]]
print(nyt_ml_std.tail())

#Reset index so that Year is its own column
nyt_ml = nyt_ml.reset_index()
nyt_ml = nyt_ml.rename(columns={"pub_year":"Year"})

#Merge with revenue dataset on Year column
nyt_ml = nyt_ml.merge(nyt_r[["Year","Revenue_Adjusted (in 2022 dollars)"]], on="Year")
#Rename adjusted revenue column
nyt_ml = nyt_ml.rename(columns={"Revenue_Adjusted (in 2022 dollars)":"Adjusted 2022 Revenue in Billions USD"})

# Export new revenue and sentiment dataset
nyt_ml.to_csv("nyt_sentiment_revenue.csv",index=False)

nyt_ml.head()

          compound_sentiment  sentiment_direction
pub_year                                         
2000                0.226935            -0.101711
2001                0.218564            -0.080519
2002                0.225028            -0.155303
2003                0.242510            -0.149034
2004                0.227240            -0.111406
          compound_sentiment  sentiment_direction
pub_year                                         
2019                0.462930             0.185226
2020                1.022875             0.220867
2021                0.730834            -1.474510
2022                1.393760            -0.048648
2023                0.538934             0.258102


Unnamed: 0,Year,compound_sentiment,sentiment_direction,Adjusted 2022 Revenue in Billions USD
0,2000,0.226935,-0.101711,5.728217
1,2001,0.218564,-0.080519,4.99127
2,2002,0.225028,-0.155303,5.011206
3,2003,0.24251,-0.149034,5.138158
4,2004,0.22724,-0.111406,4.896411


## Data Visualization

In [None]:
#Article counts per section, with the most frequent section first
fig = px.histogram(
    nyt,
    x='section_name',
    title="Top Frequency Counts of Article Sections",
    labels={"count": "Count", "section_name": "Section Name"},
).update_xaxes(categoryorder='total descending')
fig.show()

In [23]:
#Adjusted revenue against yearly compound sentiment; hovering a point shows its year
fig2 = px.scatter(
    data_frame=nyt_ml,
    x="compound_sentiment",
    y="Adjusted 2022 Revenue in Billions USD",
    hover_name=nyt_ml["Year"],
    title="Adjusted 2022 Revenue vs Compound Sentiment",
    labels={
        "compound_sentiment": "Compound Sentiment",
        "Adjusted 2022 Revenue in Billions USD": "Adjusted 2022 Revenue (Billions USD)",
    },
)
fig2.show()

In [24]:
#Adjusted revenue against yearly sentiment direction; hovering a point shows its year
fig3 = px.scatter(
    data_frame=nyt_ml,
    x="sentiment_direction",
    y="Adjusted 2022 Revenue in Billions USD",
    hover_name=nyt_ml["Year"],
    title="Adjusted 2022 Revenue vs Sentiment Direction",
    labels={
        "sentiment_direction": "Sentiment Direction",
        "Adjusted 2022 Revenue in Billions USD": "Adjusted 2022 Revenue (Billions USD)",
    },
)
fig3.show()

## Machine Learning

In [25]:
#Last ML data cleaning

#Drop Year, then standardize the two sentiment columns; the revenue column is left unscaled
nyt_ml = nyt_ml.drop(columns=["Year"]).assign(
    compound_sentiment=lambda frame: standard_units(frame["compound_sentiment"]),
    sentiment_direction=lambda frame: standard_units(frame["sentiment_direction"]),
)

In [26]:
#Target is the adjusted revenue column; features are every remaining column
y = nyt_ml["Adjusted 2022 Revenue in Billions USD"]
X = nyt_ml.drop("Adjusted 2022 Revenue in Billions USD", axis="columns")

In [27]:
#Find averages of MSE, RMSE, and beta coefficients for 1000 models - 6s runtime
n_models = 1000
mse_avg = 0
rmse_avg = 0
beta1_arr = []
beta2_arr = []
intercept_avg = 0  #Running sum only; the equation cell below divides it by 1000
for i in range(n_models):
  #Split data; seeding each split with the loop index keeps the splits distinct but repeatable across runs
  X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=i)
  #Create and train model
  mult_reg = LinearRegression()
  mult_reg.fit(X_train,y_train)

  #Generate Predictions
  predicted = mult_reg.predict(X_test)
  actual = np.array(y_test)

  #MSE & RMSE
  #Arguments are passed as (y_true, y_pred). RMSE is taken as the square root of the MSE because
  #the squared=False flag is deprecated and no longer accepted by recent scikit-learn releases.
  mse = mean_squared_error(actual,predicted)
  mse_avg += mse
  rmse = np.sqrt(mse)
  rmse_avg += rmse

  #Beta Coefficients (one per feature column of X, in column order)
  coef = mult_reg.coef_
  beta1_arr.append(coef[0])
  beta2_arr.append(coef[1])
  intercept = mult_reg.intercept_
  intercept_avg += intercept

mse_avg /= n_models
rmse_avg /= n_models
print("Average MSE is", round(mse_avg,5), "and Average RMSE is", round(rmse_avg,5))

Average MSE is 1.36578 and Average RMSE is 1.13747


In [28]:
#Average the fitted coefficients and the summed intercepts into a single regression equation
#NOTE(review): intercept_avg is divided in place, so re-running this cell alone divides it again
beta1_avg = np.asarray(beta1_arr).mean()
beta2_avg = np.asarray(beta2_arr).mean()
intercept_avg = intercept_avg / 1000

print(
    "Adjusted 2022 Revenue in Billions USD =",
    round(beta1_avg,3),
    "*compound_sentiment +",
    round(beta2_avg,3),
    "*sentiment_direction +",
    round(intercept_avg,3),
)

Adjusted 2022 Revenue in Billions USD = -0.953 *compound_sentiment + -1.516 *sentiment_direction + 3.232


## Predictions

In [29]:
#2023 sentiment (previously recorded in the "ML Dataset Creation" section);
#left as the cell's last expression so the frame is displayed
sentiment2023

Unnamed: 0_level_0,compound_sentiment,sentiment_direction
pub_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2023,0.538934,0.258102


In [30]:
#Apply the averaged regression equation to the recorded 2023 sentiment values
predicted2023Revenue = (
    beta1_avg * sentiment2023.at[2023, "compound_sentiment"]
    + beta2_avg * sentiment2023.at[2023, "sentiment_direction"]
    + intercept_avg
)
#Scale from billions to dollars and report the average RMSE as the +/- margin
print("The predicted adjusted 2022 revenue for 2023 is $", format(round(predicted2023Revenue*1000000000,2), ','), u"\u00B1 $", format(round(rmse_avg*1000000000,2), ','))

The predicted adjusted 2022 revenue for 2023 is $ 2,327,262,809.13 ± $ 1,137,467,839.72
