##### Reading data into df

In [60]:
import pandas as pd
import json
import os
from pathlib import Path

# Iterate through all articles in document
base_dir = Path('../../data/raw')
company = 'Apple_Inc'

def read_file(path):
    # Load content from json
    with open(path, 'r') as file:
        all_data = json.load(file)

    # Select response
    data = all_data['response']['docs']

    return data
    

def read_articles(data):

    column_list = ['pub_date','abstract','snippet','lead_para','headline','doc_type','section_name','type_of_material','rank','web_url']
    df = pd.DataFrame(columns=column_list)

    for article in data:
        # print(article)
        # Retrieve components
        pub_date = article['pub_date']

        abstract = article['abstract']
        snippet = article['snippet']
        lead_para = article['lead_paragraph']
        headline = article['headline']['main']

        doc_type = article['document_type']
        section_name = article['section_name']
        type_of_material = article.get('type_of_material', None) # will throw an error 

        keywords = article['keywords']
        rank = next((item['rank'] for item in keywords if item['name'] == 'organizations' and item['value'] == 'Apple Inc'), None) # next retrieves first matching rank, may not be necessary

        web_url = article['web_url']

        # Assign to new row in df
        df.loc[len(df)] = [pub_date,
                                    abstract,
                                    snippet,
                                    lead_para,
                                    headline,
                                    doc_type,
                                    section_name,
                                    type_of_material,
                                    rank,
                                    web_url]
    
    return df


# MAIN function below
def compile_company_df(base_dir, company):
    
    column_list = ['pub_date','abstract','snippet','lead_para','headline','doc_type','section_name','type_of_material','rank','web_url']
    df = pd.DataFrame(columns=column_list)

    # Run through directory files
    for year in range(2015, 2025):
        year_dir = base_dir / str(year) 

        for file in year_dir.rglob("*"):
            if file.is_file() and company in file.name:
                data = read_file(year_dir / file.name)
                temp_df = read_articles(data)
                df = pd.concat([df, temp_df], ignore_index=True)
    
    return df   

df = compile_company_df(base_dir, company)
# read_articles(data)
# read_file('../../data/raw/2015/organizations_Apple_Inc_mth01_pg0')
# data

##### Checking df

* Snippet and abstract are exactly the same in all except 6 cases
* Snippet has some "" (empty string) values

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2143 entries, 0 to 2142
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pub_date          2143 non-null   object
 1   abstract          2143 non-null   object
 2   snippet           2143 non-null   object
 3   lead_para         2143 non-null   object
 4   headline          2143 non-null   object
 5   doc_type          2143 non-null   object
 6   section_name      2143 non-null   object
 7   type_of_material  2142 non-null   object
 8   rank              2143 non-null   object
 9   web_url           2143 non-null   object
dtypes: object(10)
memory usage: 167.6+ KB


In [62]:
# Check data
df[df.abstract != df.snippet]

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
39,2015-02-12T05:05:35+0000,"App Pins aims to do for smartphone apps what Pinterest’s service has done for photos, recipes and many other types of websites. In short, the service is a type of digital corkboard that lets visitors save, or “pin,” items they like or places they want to go.","App Pins aims to do for smartphone apps what Pinterest’s service has done for photos, recipes and many other types of websites. In short, the service is a type of digital corkboard that lets visitors save, or “pin,” items they like or places they ...","There are more than 1.4 million mobile applications in Apple’s App Store. This is a good thing for Apple’s customers, because the choices are practically endless. This is also a bad thing for Apple’s customers, because the choices are practically endless.","Aiming to Ease App Discovery, Apple Pairs With Pinterest",article,Technology,News,2,https://bits.blogs.nytimes.com/2015/02/12/apple-pairs-with-pinterest-for-better-app-discovery/
276,2015-10-19T13:35:02+0000,"Tech earnings season is going into full swing, with companies including Yahoo, IBM, Google and Microsoft set to report quarterly results this week. Their financial data may offer insight into which companies are gaining the upper hand in some tech battles.","Tech earnings season is going into full swing, with companies including Yahoo, IBM, Google and Microsoft set to report quarterly results this week. Their financial data may offer insight into which companies are gaining the upper hand in some tech...",The technology industry’s financial data is set to go on full display this week.,Daily Report: Tech Tussles Take Center Stage in Earnings,article,Technology,News,4,https://bits.blogs.nytimes.com/2015/10/19/daily-report-tech-tussles-take-center-stage-in-earnings/
1568,2021-02-16T21:27:40+0000,A North Dakota bill that an Apple executive had warned “threatens to destroy iPhone as you know it” died in a vote on Tuesday.,,A North Dakota bill that an Apple executive had warned “threatens to destroy iPhone as you know it” died in a vote on Tuesday.,North Dakota lawmakers vote down a bill that threatened Apple’s and Google’s revenues.,article,Business Day,News,2,https://www.nytimes.com/2021/02/16/business/north-dakota-app-store-bill.html
1601,2021-05-03T11:31:09+0000,"Apple and Epic Games, maker of the wildly popular game Fortnite, are set to square off on Monday in a trial that could decide how much control Apple can exert over the app economy. The trial is scheduled to open with testimony from Tim Sweeney, the chief of Epic, on why he believes Apple is a monopoly abusing its power.",,"Apple and Epic Games, maker of the wildly popular game Fortnite, are set to square off on Monday in a trial that could decide how much control Apple can exert over the app economy. The trial is scheduled to open with testimony from Tim Sweeney, the chief of Epic, on why he believes Apple is a monopoly abusing its power.",Apple and Epic Games head to court over app revenue.,article,Business Day,News,5,https://www.nytimes.com/2021/05/03/business/apple-and-epic-games-head-to-court-over-app-revenue.html
1617,2021-05-27T12:21:50+0000,"When Apple and Google collaborated last year on a smartphone-based system to track the spread of the coronavirus, the news was seen as a game changer. The software uses Bluetooth signals to detect app users who come into close contact. If a user later tests positive, the person can anonymously notify other app users whom the person may have crossed paths with in restaurants, on trains or elsewhere.",,"When Apple and Google collaborated last year on a smartphone-based system to track the spread of the coronavirus, the news was seen as a game changer. The software uses Bluetooth signals to detect app users who come into close contact. If a user later tests positive, the person can anonymously notify other app users whom the person may have crossed paths with in restaurants, on trains or elsewhere.",Virus alert apps powered by Apple and Google have had limited success.,article,Business Day,News,5,https://www.nytimes.com/2021/05/27/business/virus-alert-apps-powered-by-apple-and-google-have-had-limited-success.html
1989,2023-10-23T17:21:54+0000,"In this 2017 internal Google document, an executive compares the search ads business to that of selling cigarettes or drugs. The executive testified at the trial that he produced the document during a communications training where he was practicing how to use hyperbole to get someone’s attention.","In this 2017 internal Google document, an executive compares the search ads business to that of selling cigarettes or drugs. The executive testified at the trial that he produced the document during a communications training where he was practicin...","In this 2017 internal Google document, an executive compares the search ads business to that of selling cigarettes or drugs. The executive testified at the trial that he produced the document during a communications training where he was practicing how to use hyperbole to get someone’s attention.",Read the document,multimedia,Business Day,Interactive Feature,5,https://www.nytimes.com/interactive/2023/10/23/business/google-trial-drug-cartel.html


In [63]:
df.iloc[176, 1:3].abstract


'A federal appeals court said it agreed with a 2013 decision that Apple conspired with publishers to raise prices for e-books.'

In [64]:
df.iloc[176, 1:3].snippet

'A federal appeals court said it agreed with a 2013 decision that Apple conspired with publishers to raise prices for e-books.'

In [65]:
#check for empty strings
(df == "").sum()

pub_date             0
abstract             0
snippet              3
lead_para           19
headline             0
doc_type             0
section_name         0
type_of_material     0
rank                 0
web_url              0
dtype: int64

In [66]:
df[df.snippet == ""]

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
1568,2021-02-16T21:27:40+0000,A North Dakota bill that an Apple executive had warned “threatens to destroy iPhone as you know it” died in a vote on Tuesday.,,A North Dakota bill that an Apple executive had warned “threatens to destroy iPhone as you know it” died in a vote on Tuesday.,North Dakota lawmakers vote down a bill that threatened Apple’s and Google’s revenues.,article,Business Day,News,2,https://www.nytimes.com/2021/02/16/business/north-dakota-app-store-bill.html
1601,2021-05-03T11:31:09+0000,"Apple and Epic Games, maker of the wildly popular game Fortnite, are set to square off on Monday in a trial that could decide how much control Apple can exert over the app economy. The trial is scheduled to open with testimony from Tim Sweeney, the chief of Epic, on why he believes Apple is a monopoly abusing its power.",,"Apple and Epic Games, maker of the wildly popular game Fortnite, are set to square off on Monday in a trial that could decide how much control Apple can exert over the app economy. The trial is scheduled to open with testimony from Tim Sweeney, the chief of Epic, on why he believes Apple is a monopoly abusing its power.",Apple and Epic Games head to court over app revenue.,article,Business Day,News,5,https://www.nytimes.com/2021/05/03/business/apple-and-epic-games-head-to-court-over-app-revenue.html
1617,2021-05-27T12:21:50+0000,"When Apple and Google collaborated last year on a smartphone-based system to track the spread of the coronavirus, the news was seen as a game changer. The software uses Bluetooth signals to detect app users who come into close contact. If a user later tests positive, the person can anonymously notify other app users whom the person may have crossed paths with in restaurants, on trains or elsewhere.",,"When Apple and Google collaborated last year on a smartphone-based system to track the spread of the coronavirus, the news was seen as a game changer. The software uses Bluetooth signals to detect app users who come into close contact. If a user later tests positive, the person can anonymously notify other app users whom the person may have crossed paths with in restaurants, on trains or elsewhere.",Virus alert apps powered by Apple and Google have had limited success.,article,Business Day,News,5,https://www.nytimes.com/2021/05/27/business/virus-alert-apps-powered-by-apple-and-google-have-had-limited-success.html


In [67]:
# 19 lead_paras blank
df[df.lead_para == ""] # should we take them out?

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
8,2015-01-02T17:09:23+0000,"You can adjust your settings so that only certain people can see your images. Also, syncing files between Macs and PCs.","You can adjust your settings so that only certain people can see your images. Also, syncing files between Macs and PCs.",,Sharing Flickr Photos Privately,article,Technology,Question,5,https://www.nytimes.com/2015/01/02/technology/personaltech/sharing-flickr-photos-privately.html
26,2015-02-27T15:06:31+0000,"Learn about the differences between Amazon Fire TV and Fire TV stick (besides price), and also discover how to transfer airline boarding passes between iPhones.","Learn about the differences between Amazon Fire TV and Fire TV stick (besides price), and also discover how to transfer airline boarding passes between iPhones.",,Choosing Between Amazon’s TV Streaming Hardware,article,Technology,Question,4,https://www.nytimes.com/2015/02/27/technology/personaltech/choosing-between-amazons-tv-streaming-hardware.html
29,2015-02-11T14:35:37+0000,"Tips on how to transfer photographs into iPhoto on your Mac, and advice for connecting to a virtual private network from a mobile device.","Tips on how to transfer photographs into iPhoto on your Mac, and advice for connecting to a virtual private network from a mobile device.",,Importing Images to iPhoto,article,Technology,Question,12,https://www.nytimes.com/2015/02/12/technology/personaltech/importing-images-to-iphoto.html
67,2015-03-02T17:55:05+0000,The Internal Revenue Service is not emailing you to request personal information. Not now. Not ever.,The Internal Revenue Service is not emailing you to request personal information. Not now. Not ever.,,The Tax Con Cometh,article,Technology,Question,4,https://www.nytimes.com/2015/03/05/technology/personaltech/the-tax-scam-cometh.html
100,2015-04-17T14:07:02+0000,"Apps will allow users to peek at their balance and recent activity, but not to pay bills or make deposits.","Apps will allow users to peek at their balance and recent activity, but not to pay bills or make deposits.",,"Banking on Apple Watch Will Be Limited, for Now",article,Your Money,News,3,https://www.nytimes.com/2015/04/17/your-money/banking-on-apple-watch-will-be-limited-for-now.html
108,2015-04-03T13:42:14+0000,The inexpensive device links a standard audio cable from the music player to a computer’s USB jack.,The inexpensive device links a standard audio cable from the music player to a computer’s USB jack.,,Converting Audio From a Cassette to MP3 Requires an Adapter,article,Technology,Question,3,https://www.nytimes.com/2015/04/03/technology/personaltech/converting-audio-from-a-cassette-to-mp3-requires-an-adapter.html
116,2015-05-22T13:14:57+0000,You can send the message to someone not in the original conversation by taking a couple of steps.,You can send the message to someone not in the original conversation by taking a couple of steps.,,Forwarding Text Messages on the iPhone,article,Technology,Question,6,https://www.nytimes.com/2015/05/23/technology/personaltech/23askkextra.html
153,2015-06-12T14:07:31+0000,"Tips for mirroring your phone’s display on a Mac, and how to get desktop notifications from Gmail.","Tips for mirroring your phone’s display on a Mac, and how to get desktop notifications from Gmail.",,Recording a Phone’s Screen on a Mac,article,Technology,Question,1,https://www.nytimes.com/2015/06/13/technology/personaltech/recording-a-phones-screen-on-a-mac.html
167,2015-06-10T15:45:58+0000,Some independent booksellers gain a portion of e-book sales through partnerships with bigger e-bookstores like Kobo and Amazon Source.,Some independent booksellers gain a portion of e-book sales through partnerships with bigger e-bookstores like Kobo and Amazon Source.,,"Shop Locally, Download Globally",article,Technology,Question,4,https://www.nytimes.com/2015/06/11/technology/personaltech/shop-locally-download-globally.html
188,2015-07-15T17:56:34+0000,"How to rid the bookmarks menu of a utility you don’t use. Plus, browsing with Chrome on Windows 10 and learning MacBook trackpad gestures.","How to rid the bookmarks menu of a utility you don’t use. Plus, browsing with Chrome on Windows 10 and learning MacBook trackpad gestures.",,Picking Pockets Off the Firefox Bookmarks List,article,Technology,Question,7,https://www.nytimes.com/2015/07/16/technology/personaltech/picking-pockets-off-the-firefox-bookmarks-list.html


In [68]:
# Reformat the date
df['pub_date'] = pd.to_datetime(df['pub_date']) 
df['pub_date'] = df['pub_date'].dt.date

print(df[['pub_date']])  # Display result


        pub_date
0     2015-01-05
1     2015-01-28
2     2015-01-21
3     2015-01-08
4     2015-01-28
...          ...
2138  2024-12-08
2139  2024-12-17
2140  2024-12-13
2141  2024-12-18
2142  2024-12-07

[2143 rows x 1 columns]


In [69]:
pd.set_option('display.max_colwidth', None)
print(df['headline']) # most headlines are not meaningful

0            What to Watch For in 2015: Galliano and Gucci, Elections and Apple
1                               New Rules in China Upset Western Tech Companies
2                              Apple Acquires Semetric, a Data Analysis Company
3                       Apple’s Cut From App Sales Reached $4.5 Billion in 2014
4                    Daily Report: IPhone Sales in China Bolster Apple Earnings
                                         ...                                   
2138    Apple Sued for Failing to Curtail Child Sexual Abuse Material on iCloud
2139                                                     The Great Capitulation
2140                The Messy Modern Music Business, According to Larry Jackson
2141                        How to Find Your Way Around That Updated Photos App
2142       On These Apps, the Dark Promise of Mothers Sexually Abusing Children
Name: headline, Length: 2143, dtype: object


In [70]:
print(df['lead_para']) # lead para does not seem entirely useful

0                                                                                                                                                                    It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:
1       HONG KONG — The Chinese government has adopted new regulations requiring companies that sell computer equipment to Chinese banks to turn over secret source code, submit to invasive audits and build so-called back doors into hardware and software, according to a copy of the rules obtained by foreign technology companies that do billions of dollars’ worth of business in China.
2                                                                                                                                                                                                                        Apple has a

#### Misc Testing Code

##### Processing data (testing 1 file)

In [7]:
# Read data from raw files
# testing with one file

import json

# Load content from json
with open('../../data/raw/2015/organizations_Apple_Inc_mth01_pg0', 'r') as file:
    data = json.load(file)

# Print data
print(data)
print(len(data))


{'status': 'OK', 'copyright': 'Copyright (c) 2025 The New York Times Company. All Rights Reserved.', 'response': {'docs': [{'abstract': 'It’s time to take note of what lies at the top of the fashion agenda for 2015.', 'web_url': 'https://runway.blogs.nytimes.com/2015/01/05/what-to-watch-for-in-2015-galliano-and-gucci-elections-and-apple/', 'snippet': 'It’s time to take note of what lies at the top of the fashion agenda for 2015.', 'lead_paragraph': 'It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:', 'source': 'The New York Times', 'multimedia': [{'rank': 0, 'subtype': 'wide', 'caption': None, 'credit': None, 'type': 'image', 'url': 'images/2015/01/05/fashion/05runway/05runway-thumbWide.jpg', 'height': 126, 'width': 190, 'legacy': {'widewidth': 190, 'wideheight': 126, 'wide': 'images/2015/01/05/fashion/05runway/05runw

In [30]:
data["response"]['docs']

[{'abstract': 'It’s time to take note of what lies at the top of the fashion agenda for 2015.',
  'web_url': 'https://runway.blogs.nytimes.com/2015/01/05/what-to-watch-for-in-2015-galliano-and-gucci-elections-and-apple/',
  'snippet': 'It’s time to take note of what lies at the top of the fashion agenda for 2015.',
  'lead_paragraph': 'It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:',
  'source': 'The New York Times',
  'multimedia': [{'rank': 0,
    'subtype': 'wide',
    'caption': None,
    'credit': None,
    'type': 'image',
    'url': 'images/2015/01/05/fashion/05runway/05runway-thumbWide.jpg',
    'height': 126,
    'width': 190,
    'legacy': {'widewidth': 190,
     'wideheight': 126,
     'wide': 'images/2015/01/05/fashion/05runway/05runway-thumbWide.jpg'},
    'subType': 'wide',
    'crop_name': 'thumbWide

In [8]:
# check length 
len(data['response']['docs'])

10

In [9]:
# Need to extract abstract, web_url, snippet, lead_paragraph, pub_date, document_type, section_name, type_of_material, rank (?)
article = data['response']['docs'][0]

# Testing retrieval of components
pub_date = article['pub_date']

abstract = article['abstract']
snippet = article['snippet']
lead_para = article['lead_paragraph']
headline = article['headline']['main']

doc_type = article['document_type']
section_name = article['section_name']
type_of_material = article['type_of_material']

# Find the rank of Apple Inc keyword in article: gauge of relevance 
keywords = article['keywords']
rank = next((item['rank'] for item in keywords if item['name'] == 'organizations' and item['value'] == 'Apple Inc'), None) # next retrieves first matching rank, may not be necessary

web_url = article['web_url']

In [10]:
pub_date

'2015-01-05T14:03:03+0000'

In [None]:
df

Labelling '1' if headline and snippet contain Apple 


In [71]:
df['apple_label'] = df.apply(lambda x: 1 if 'Apple' in str(x['snippet']) and 'Apple' in str(x['headline']) else 0, axis=1)
df.head()

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url,apple_label
0,2015-01-05,It’s time to take note of what lies at the top of the fashion agenda for 2015.,It’s time to take note of what lies at the top of the fashion agenda for 2015.,"It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:","What to Watch For in 2015: Galliano and Gucci, Elections and Apple",article,Fashion & Style,News,20,https://runway.blogs.nytimes.com/2015/01/05/what-to-watch-for-in-2015-galliano-and-gucci-elections-and-apple/,0
1,2015-01-28,"Stringent regulations, including a requirement to share secret codes, are another form of economic protectionism, Western technology companies say.","Stringent regulations, including a requirement to share secret codes, are another form of economic protectionism, Western technology companies say.","HONG KONG — The Chinese government has adopted new regulations requiring companies that sell computer equipment to Chinese banks to turn over secret source code, submit to invasive audits and build so-called back doors into hardware and software, according to a copy of the rules obtained by foreign technology companies that do billions of dollars’ worth of business in China.",New Rules in China Upset Western Tech Companies,article,Technology,News,8,https://www.nytimes.com/2015/01/29/technology/in-china-new-cybersecurity-rules-perturb-western-tech-companies.html,0
2,2015-01-21,"Semetric, based in Britain, is one of a growing number of companies that record labels and others consult for data on how music is consumed online.","Semetric, based in Britain, is one of a growing number of companies that record labels and others consult for data on how music is consumed online.","Apple has acquired Semetric, a company that analyzes data about music online, a move that may hint at Apple’s ambitions as it prepares to revamp iTunes and Beats Music.","Apple Acquires Semetric, a Data Analysis Company",article,Technology,News,2,https://www.nytimes.com/2015/01/22/business/apple-acquires-semetric-a-data-analysis-company.html,0
3,2015-01-08,"Google’s app store is the largest in the world, but still makes far less money than the Apple App Store.","Google’s app store is the largest in the world, but still makes far less money than the Apple App Store.","Apple continues to make billions of dollars from mobile apps sold in its App Store. The company said on Thursday that in 2014, billings from app sales rose 50 percent from the previous year, which results in roughly $15 billion in revenue for app developers and a $4.5 billion cut for Apple.",Apple’s Cut From App Sales Reached $4.5 Billion in 2014,article,Technology,News,4,https://bits.blogs.nytimes.com/2015/01/08/apples-cut-from-app-sales-reached-4-5-billion-in-2014/,1
4,2015-01-28,The introduction of a large-screen iPhone in China helped propel Apple’s profit to $18 billion and its revenue to nearly $75 billion.,The introduction of a large-screen iPhone in China helped propel Apple’s profit to $18 billion and its revenue to nearly $75 billion.,Apple is famous for setting trends.,Daily Report: IPhone Sales in China Bolster Apple Earnings,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/01/28/daily-report-iphone-sales-in-china-bolster-apple-earnings/,1


In [72]:
df[df.apple_label==1].value_counts()

pub_date    abstract                                                                                                                                                                    snippet                                                                                                                                                                     lead_para                                                                                                                                                                                                                                                                                                                  headline                                                                       doc_type  section_name  type_of_material  rank  web_url                                                                                                      apple_label
2015-01-08  Google’s app store is the largest in the world, but still makes far l

Sentiment Analysis (NLTK, VADER, SentiMo)

We shall use the Headline + Snippet to analyse for Sentiment

In [77]:
sentiment_df=df #save the df first 

NLTK Vader

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def get_sentiment(text):
    if pd.isna(text):  # Handle NaN values
        return 0  # Neutral score for missing text
    return sia.polarity_scores(text)['compound']
x = sentiment_df['headline'].apply(get_sentiment)
y = sentiment_df['snippet'].apply(get_sentiment)
sentiment_df['NLTK_sentiment'] = (x+y)/2

def classify_sentiment(score):
    if score > 0.1:
        return 'Positive'
    elif score < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

sentiment_df['sentiment_label'] = sentiment_df['NLTK_sentiment'].apply(classify_sentiment)
sentiment_df.head()



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\leyon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url,apple_label,NLTK_sentiment,sentiment_label
0,2015-01-05,It’s time to take note of what lies at the top of the fashion agenda for 2015.,It’s time to take note of what lies at the top of the fashion agenda for 2015.,"It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:","What to Watch For in 2015: Galliano and Gucci, Elections and Apple",article,Fashion & Style,News,20,https://runway.blogs.nytimes.com/2015/01/05/what-to-watch-for-in-2015-galliano-and-gucci-elections-and-apple/,0,-0.12500,Negative
1,2015-01-28,"Stringent regulations, including a requirement to share secret codes, are another form of economic protectionism, Western technology companies say.","Stringent regulations, including a requirement to share secret codes, are another form of economic protectionism, Western technology companies say.","HONG KONG — The Chinese government has adopted new regulations requiring companies that sell computer equipment to Chinese banks to turn over secret source code, submit to invasive audits and build so-called back doors into hardware and software, according to a copy of the rules obtained by foreign technology companies that do billions of dollars’ worth of business in China.",New Rules in China Upset Western Tech Companies,article,Technology,News,8,https://www.nytimes.com/2015/01/29/technology/in-china-new-cybersecurity-rules-perturb-western-tech-companies.html,0,-0.04290,Neutral
2,2015-01-21,"Semetric, based in Britain, is one of a growing number of companies that record labels and others consult for data on how music is consumed online.","Semetric, based in Britain, is one of a growing number of companies that record labels and others consult for data on how music is consumed online.","Apple has acquired Semetric, a company that analyzes data about music online, a move that may hint at Apple’s ambitions as it prepares to revamp iTunes and Beats Music.","Apple Acquires Semetric, a Data Analysis Company",article,Technology,News,2,https://www.nytimes.com/2015/01/22/business/apple-acquires-semetric-a-data-analysis-company.html,0,0.12500,Positive
3,2015-01-08,"Google’s app store is the largest in the world, but still makes far less money than the Apple App Store.","Google’s app store is the largest in the world, but still makes far less money than the Apple App Store.","Apple continues to make billions of dollars from mobile apps sold in its App Store. The company said on Thursday that in 2014, billings from app sales rose 50 percent from the previous year, which results in roughly $15 billion in revenue for app developers and a $4.5 billion cut for Apple.",Apple’s Cut From App Sales Reached $4.5 Billion in 2014,article,Technology,News,4,https://bits.blogs.nytimes.com/2015/01/08/apples-cut-from-app-sales-reached-4-5-billion-in-2014/,1,-0.08895,Neutral
4,2015-01-28,The introduction of a large-screen iPhone in China helped propel Apple’s profit to $18 billion and its revenue to nearly $75 billion.,The introduction of a large-screen iPhone in China helped propel Apple’s profit to $18 billion and its revenue to nearly $75 billion.,Apple is famous for setting trends.,Daily Report: IPhone Sales in China Bolster Apple Earnings,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/01/28/daily-report-iphone-sales-in-china-bolster-apple-earnings/,1,0.22020,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2138,2024-12-08,"Victims of abuse are seeking more than $1.2 billion in damages, arguing that the company abandoned a 2021 system it developed to find abusive material.","Victims of abuse are seeking more than $1.2 billion in damages, arguing that the company abandoned a 2021 system it developed to find abusive material.","The abuse began when she was still an infant. A relative molested her, took photographs and swapped the images with others online. He allowed another man to spend time with her, multiplying the abuse.",Apple Sued for Failing to Curtail Child Sexual Abuse Material on iCloud,article,Technology,News,7,https://www.nytimes.com/2024/12/08/technology/apple-child-sexual-abuse-material-lawsuit.html,0,-0.88970,Negative
2139,2024-12-17,Powerful Americans have lost the will to resist Trump.,Powerful Americans have lost the will to resist Trump.,"At a press conference at Mar-a-Lago on Monday, Donald Trump described recent visits from Tim Cook, C.E.O. of Apple, Sergey Brin, a co-founder of Google, and other tech barons. “In the first term, everyone was fighting me,” he said. “In this term, everyone wants to be my friend.” For once, he wasn’t exaggerating.",The Great Capitulation,article,Opinion,Op-Ed,8,https://www.nytimes.com/2024/12/16/opinion/trump-tech-leaders-support.html,0,0.37645,Positive
2140,2024-12-13,"On this week’s episode of Popcast, an interview with a record label veteran who has worked closely with Whitney Houston, Drake, Chief Keef and Lana Del Rey.","On this week’s episode of Popcast, an interview with a record label veteran who has worked closely with Whitney Houston, Drake, Chief Keef and Lana Del Rey.","The music executive Larry Jackson, a founder of the entertainment company Gamma, has seen several sea changes in the recording business from different vantages over different eras of disruption.","The Messy Modern Music Business, According to Larry Jackson",article,Arts,News,5,https://www.nytimes.com/2024/12/13/arts/music/larry-jackson-popcast-interview-drake.html,0,-0.18060,Negative
2141,2024-12-18,"Redesigned interfaces, new features and artificial intelligence are meant to enhance the experience, even if the changes can be confusing at first.","Redesigned interfaces, new features and artificial intelligence are meant to enhance the experience, even if the changes can be confusing at first.","Apple’s fall overhaul of its Photos app — publicized by the company as its “biggest redesign ever” — gave the software a fresh look and new methods for managing your portable picture library on the iPhone and iPad. However, not everyone was a fan, as the new design retired familiar navigational icons in favor of a “unified” view that put almost everything on one screen.",How to Find Your Way Around That Updated Photos App,article,Technology,News,11,https://www.nytimes.com/2024/12/18/technology/personaltech/apple-google-photo-apps-updates-changes.html,0,0.14800,Positive


Using Transformer

In [None]:
# ! pip install tensorflow
# ! pip install transformers
# ! conda update tensorflow keras





PackageNotInstalledError: Package is not installed in prefix.
  prefix: C:\Users\leyon\anaconda3\envs\new_etlenv
  package name: tensorflow




In [87]:
# testing
from transformers import pipeline
import tensorflow as tf
classifier = pipeline("sentiment-analysis")
print(classifier("Apple's stock is performing great!"))


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998513460159302}]


In [None]:


# Extract labels and scores into separate columns
sentiment_df['headline_label'] = sentiment_df['headline'].apply(lambda x: classifier(x)[0]['label'])
sentiment_df['headline_score'] = sentiment_df['headline'].apply(lambda x: classifier(x)[0]['score'])

sentiment_df['snippet_label'] = sentiment_df['snippet'].apply(lambda x: classifier(x)[0]['label'])
sentiment_df['snippet_score'] = sentiment_df['snippet'].apply(lambda x: classifier(x)[0]['score'])


def classify_sentiment(row):
    # If both labels are the same, use that label
    if row['headline_label'] == row['snippet_label']:
        return row['headline_label']
    # If scores are different, take the one with the higher score
    elif row['headline_score'] < row['snippet_score']:
        return row['snippet_label']
    else:
        return row['headline_label']


sentiment_df['Transformer_overall_sentiment'] = sentiment_df.apply(classify_sentiment, axis=1)


sentiment_df # will take a while 

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url,apple_label,NLTK_sentiment,sentiment_label,Transformer_sentiment,Transformer_overall_sentiment,headline_label,headline_score,snippet_label,snippet_score
0,2015-01-05,It’s time to take note of what lies at the top of the fashion agenda for 2015.,It’s time to take note of what lies at the top of the fashion agenda for 2015.,"It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:","What to Watch For in 2015: Galliano and Gucci, Elections and Apple",article,Fashion & Style,News,20,https://runway.blogs.nytimes.com/2015/01/05/what-to-watch-for-in-2015-galliano-and-gucci-elections-and-apple/,0,-0.12500,Negative,0.953290,POSITIVE,POSITIVE,0.999132,POSITIVE,0.907448
1,2015-01-28,"Stringent regulations, including a requirement to share secret codes, are another form of economic protectionism, Western technology companies say.","Stringent regulations, including a requirement to share secret codes, are another form of economic protectionism, Western technology companies say.","HONG KONG — The Chinese government has adopted new regulations requiring companies that sell computer equipment to Chinese banks to turn over secret source code, submit to invasive audits and build so-called back doors into hardware and software, according to a copy of the rules obtained by foreign technology companies that do billions of dollars’ worth of business in China.",New Rules in China Upset Western Tech Companies,article,Technology,News,8,https://www.nytimes.com/2015/01/29/technology/in-china-new-cybersecurity-rules-perturb-western-tech-companies.html,0,-0.04290,Neutral,0.980202,NEGATIVE,NEGATIVE,0.995439,NEGATIVE,0.964965
2,2015-01-21,"Semetric, based in Britain, is one of a growing number of companies that record labels and others consult for data on how music is consumed online.","Semetric, based in Britain, is one of a growing number of companies that record labels and others consult for data on how music is consumed online.","Apple has acquired Semetric, a company that analyzes data about music online, a move that may hint at Apple’s ambitions as it prepares to revamp iTunes and Beats Music.","Apple Acquires Semetric, a Data Analysis Company",article,Technology,News,2,https://www.nytimes.com/2015/01/22/business/apple-acquires-semetric-a-data-analysis-company.html,0,0.12500,Positive,0.972061,POSITIVE,POSITIVE,0.952105,POSITIVE,0.992017
3,2015-01-08,"Google’s app store is the largest in the world, but still makes far less money than the Apple App Store.","Google’s app store is the largest in the world, but still makes far less money than the Apple App Store.","Apple continues to make billions of dollars from mobile apps sold in its App Store. The company said on Thursday that in 2014, billings from app sales rose 50 percent from the previous year, which results in roughly $15 billion in revenue for app developers and a $4.5 billion cut for Apple.",Apple’s Cut From App Sales Reached $4.5 Billion in 2014,article,Technology,News,4,https://bits.blogs.nytimes.com/2015/01/08/apples-cut-from-app-sales-reached-4-5-billion-in-2014/,1,-0.08895,Neutral,0.998292,NEGATIVE,NEGATIVE,0.997423,NEGATIVE,0.999162
4,2015-01-28,The introduction of a large-screen iPhone in China helped propel Apple’s profit to $18 billion and its revenue to nearly $75 billion.,The introduction of a large-screen iPhone in China helped propel Apple’s profit to $18 billion and its revenue to nearly $75 billion.,Apple is famous for setting trends.,Daily Report: IPhone Sales in China Bolster Apple Earnings,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/01/28/daily-report-iphone-sales-in-china-bolster-apple-earnings/,1,0.22020,Positive,0.987522,POSITIVE,POSITIVE,0.999503,POSITIVE,0.975542
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2138,2024-12-08,"Victims of abuse are seeking more than $1.2 billion in damages, arguing that the company abandoned a 2021 system it developed to find abusive material.","Victims of abuse are seeking more than $1.2 billion in damages, arguing that the company abandoned a 2021 system it developed to find abusive material.","The abuse began when she was still an infant. A relative molested her, took photographs and swapped the images with others online. He allowed another man to spend time with her, multiplying the abuse.",Apple Sued for Failing to Curtail Child Sexual Abuse Material on iCloud,article,Technology,News,7,https://www.nytimes.com/2024/12/08/technology/apple-child-sexual-abuse-material-lawsuit.html,0,-0.88970,Negative,0.990418,NEGATIVE,NEGATIVE,0.981824,NEGATIVE,0.999012
2139,2024-12-17,Powerful Americans have lost the will to resist Trump.,Powerful Americans have lost the will to resist Trump.,"At a press conference at Mar-a-Lago on Monday, Donald Trump described recent visits from Tim Cook, C.E.O. of Apple, Sergey Brin, a co-founder of Google, and other tech barons. “In the first term, everyone was fighting me,” he said. “In this term, everyone wants to be my friend.” For once, he wasn’t exaggerating.",The Great Capitulation,article,Opinion,Op-Ed,8,https://www.nytimes.com/2024/12/16/opinion/trump-tech-leaders-support.html,0,0.37645,Positive,0.988986,NEGATIVE,NEGATIVE,0.991702,NEGATIVE,0.986271
2140,2024-12-13,"On this week’s episode of Popcast, an interview with a record label veteran who has worked closely with Whitney Houston, Drake, Chief Keef and Lana Del Rey.","On this week’s episode of Popcast, an interview with a record label veteran who has worked closely with Whitney Houston, Drake, Chief Keef and Lana Del Rey.","The music executive Larry Jackson, a founder of the entertainment company Gamma, has seen several sea changes in the recording business from different vantages over different eras of disruption.","The Messy Modern Music Business, According to Larry Jackson",article,Arts,News,5,https://www.nytimes.com/2024/12/13/arts/music/larry-jackson-popcast-interview-drake.html,0,-0.18060,Negative,0.968157,NEGATIVE,NEGATIVE,0.998757,POSITIVE,0.937556
2141,2024-12-18,"Redesigned interfaces, new features and artificial intelligence are meant to enhance the experience, even if the changes can be confusing at first.","Redesigned interfaces, new features and artificial intelligence are meant to enhance the experience, even if the changes can be confusing at first.","Apple’s fall overhaul of its Photos app — publicized by the company as its “biggest redesign ever” — gave the software a fresh look and new methods for managing your portable picture library on the iPhone and iPad. However, not everyone was a fan, as the new design retired familiar navigational icons in favor of a “unified” view that put almost everything on one screen.",How to Find Your Way Around That Updated Photos App,article,Technology,News,11,https://www.nytimes.com/2024/12/18/technology/personaltech/apple-google-photo-apps-updates-changes.html,0,0.14800,Positive,0.874199,POSITIVE,NEGATIVE,0.754618,POSITIVE,0.993781


##### Renaming files

In [5]:

import os
from pathlib import Path

base_dir = Path('../../data/raw')

# Loop through years
for year in range(2015, 2025):
    year_dir = base_dir / str(year)  # Construct path for each year

    # Traverse all files in the year directory
    for file in year_dir.rglob("*"):  # finds all files and subdirectories
        if file.is_file() and " " in file.name:
            new_name = file.name.replace(" ", "_")  
            new_path = file.with_name(new_name)  # Create new path
            file.rename(new_path)  # Rename the file
            print(f"Renamed: {file} -> {new_path}")  # Optional log output

Renamed: ../../data/raw/2015/organizations_Apple Inc_mth09_pg1 -> ../../data/raw/2015/organizations_Apple_Inc_mth09_pg1
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth08_pg1 -> ../../data/raw/2015/organizations_Apple_Inc_mth08_pg1
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth09_pg0 -> ../../data/raw/2015/organizations_Apple_Inc_mth09_pg0
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth08_pg0 -> ../../data/raw/2015/organizations_Apple_Inc_mth08_pg0
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth01_pg2 -> ../../data/raw/2015/organizations_Apple_Inc_mth01_pg2
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth03_pg3 -> ../../data/raw/2015/organizations_Apple_Inc_mth03_pg3
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth03_pg2 -> ../../data/raw/2015/organizations_Apple_Inc_mth03_pg2
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth04_pg3 -> ../../data/raw/2015/organizations_Apple_Inc_mth04_pg3
Renamed: ../../data/raw/2015/organizatio