### Data Processing

#### Reading JSON data into df

In [193]:
import pandas as pd
import json
import os
from pathlib import Path

# Iterate through all articles in document
base_dir = Path('../../data/raw')
company = 'Apple_Inc'

def read_file(path):
    # Load content from json
    with open(path, 'r') as file:
        all_data = json.load(file)

    # Select response
    data = all_data['response']['docs']

    return data
    

def read_articles(data):

    column_list = ['pub_date','abstract','snippet','lead_para','headline','doc_type','section_name','type_of_material','rank','web_url']
    df = pd.DataFrame(columns=column_list)

    for article in data:
        # print(article)
        # Retrieve components
        pub_date = article['pub_date']

        abstract = article['abstract']
        snippet = article['snippet']
        lead_para = article['lead_paragraph']
        headline = article['headline']['main']

        doc_type = article['document_type']
        section_name = article['section_name']
        type_of_material = article.get('type_of_material', None) # will throw an error 

        keywords = article['keywords']
        rank = next((item['rank'] for item in keywords if item['name'] == 'organizations' and item['value'] == 'Apple Inc'), None) # next retrieves first matching rank, may not be necessary

        web_url = article['web_url']

        # Assign to new row in df
        df.loc[len(df)] = [pub_date,
                                    abstract,
                                    snippet,
                                    lead_para,
                                    headline,
                                    doc_type,
                                    section_name,
                                    type_of_material,
                                    rank,
                                    web_url]
    
    return df


# MAIN function below
def compile_company_df(base_dir, company):
    
    column_list = ['pub_date','abstract','snippet','lead_para','headline','doc_type','section_name','type_of_material','rank','web_url']
    df = pd.DataFrame(columns=column_list)

    # Run through directory files
    for year in range(2015, 2025):
        year_dir = base_dir / str(year) 

        for file in year_dir.rglob("*"):
            if file.is_file() and company in file.name:
                data = read_file(year_dir / file.name)
                temp_df = read_articles(data)
                df = pd.concat([df, temp_df], ignore_index=True)
    
    return df   

df = compile_company_df(base_dir, company)
# read_articles(data)
# read_file('../../data/raw/2015/organizations_Apple_Inc_mth01_pg0')
# data

#### Checking df

Findings:
<br><br>
(1) Use abstract instead of snippet
* Snippet and abstract are exactly the same in all except 6 cases + abstract is more complete
* Snippet has 3 "" (empty string) values
<br><br>

(2) Consider either abstract OR lead para 
* Abstract and lead para frequently have some overlaps
* Lead para is often longer/more detailed, but may not summarise the key point
<br><br>

(3) Null/empty string values
* Lead para has 19 "" (empty string) values
* Type of material has 1 null value
<br><br>

(4) No duplicates
* Only repeats in headline/abstract occasionally, published on diff dates (regularly published column)
<br><br>

(Using Apple Inc as an example)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2143 entries, 0 to 2142
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pub_date          2143 non-null   object
 1   abstract          2143 non-null   object
 2   snippet           2143 non-null   object
 3   lead_para         2143 non-null   object
 4   headline          2143 non-null   object
 5   doc_type          2143 non-null   object
 6   section_name      2143 non-null   object
 7   type_of_material  2142 non-null   object
 8   rank              2143 non-null   object
 9   web_url           2143 non-null   object
dtypes: object(10)
memory usage: 167.6+ KB


In [29]:
# Check if abstract == snippet
df[df.abstract != df.snippet].shape

(6, 10)

In [31]:
# Check if lead para == abstract
df[df.abstract != df.lead_para].shape
df[df.abstract != df.lead_para].head(10)


Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
2,2015-03-10T00:38:04+0000,"The Apple Watch is good looking for a smartwatch and not hard to use, according to initial impressions. But who knows if it is really a necessary purchase.","The Apple Watch is good looking for a smartwatch and not hard to use, according to initial impressions. But who knows if it is really a necessary purchase.","SAN FRANCISCO — When Apple unveiled its watch last fall, the company showed only demo models of the new device — polished prototypes of the hardware running nonworking loops of the software.","Apple Watch Displays Your Digital World, at a Glance",article,Technology,News,5,https://www.nytimes.com/2015/03/10/technology/personaltech/apple-watch-displays-your-digital-world-at-a-glance.html
3,2015-03-06T17:47:46+0000,Third-party apps and recent operating system updates are often the cause; starting the phone in Safe Mode can help narrow down the search for a culprit.,Third-party apps and recent operating system updates are often the cause; starting the phone in Safe Mode can help narrow down the search for a culprit.,Q. My Android phone just started freezing and crashing a few days ago. What would cause it to do so?,Troubleshooting Android in Safe Mode,article,Technology,Question,3,https://www.nytimes.com/2015/03/06/technology/personaltech/troubleshooting-android-in-safe-mode.html
7,2015-03-31T09:30:08+0000,A portrait of a volatile boy wonder and his path to technological vanguard.,A portrait of a volatile boy wonder and his path to technological vanguard.,"In early 2009, Tim Cook presented Steve Jobs, his cancer-stricken mentor and friend, with a surprise offer: Cook wanted to donate a portion of his own liver to his ailing boss, who was stuck in dangerous limbo on California’s waiting list for liver transplants.","‘Becoming Steve Jobs,’ by Brent Schlender and Rick Tetzeli",article,Books,Review,6,https://www.nytimes.com/2015/04/05/books/review/becoming-steve-jobs-by-brent-schlender-and-rick-tetzeli.html
8,2015-03-10T10:32:25+0000,"Initial impressions of the Apple Watch are that it is good-looking for a smart watch and not hard to use, but whether it is yet a necessary purchase remains unsettled.","Initial impressions of the Apple Watch are that it is good-looking for a smart watch and not hard to use, but whether it is yet a necessary purchase remains unsettled.","On Monday, Apple allowed a closer look at its much anticipated watch. Farhad Manjoo gives his impressions of the timepiece on three fronts: the hardware, the interface and the watch’s uses.",Daily Report: Using the Apple Watch,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/03/10/daily-report-using-the-apple-watch/
9,2015-03-25T22:06:06+0000,"“Becoming Steve Jobs,” a new biography by Brent Schlender and Rick Tetzeli, focuses on the period between Mr. Jobs’s first and second stints as Apple’s leader.","“Becoming Steve Jobs,” a new biography by Brent Schlender and Rick Tetzeli, focuses on the period between Mr. Jobs’s first and second stints as Apple’s leader.","The main point of the new business-oriented biography “Becoming Steve Jobs,” by Brent Schlender and Rick Tetzeli is that Steve Jobs has been misrepresented. Blame Walter Isaacson’s “Steve Jobs” (2011), as the authors do, for the public perception that Mr. Jobs never outgrew the managerial style of the scheming, screaming, cheating, smelly hothead he may — may — have been in his early years. Instead, Mr. Schlender and Mr. Tetzeli say in their new book, Mr. Jobs developed a wise, mature, deliberate executive style for which he is seldom given credit, one that helped lead Apple to glorious heights.",Review: ‘Becoming Steve Jobs’ Focuses on Another Apple Era,article,Books,Review,8,https://www.nytimes.com/2015/03/26/books/review-becoming-steve-jobs-focuses-on-another-apple-era.html
10,2015-03-24T09:30:28+0000,"Brent Schlender and Rick Tetzeli, who wrote “Becoming Steve Jobs,” discuss the evolution of Steve Jobs and how they won Apple’s cooperation for the unauthorized biography of its former chief executive.","Brent Schlender and Rick Tetzeli, who wrote “Becoming Steve Jobs,” discuss the evolution of Steve Jobs and how they won Apple’s cooperation for the unauthorized biography of its former chief executive.","Apple co-founder Steve Jobs has been the subject of myriad books. But the latest one, “Becoming Steve Jobs,” released on Tuesday, is the only one with Apple’s stamp of approval.",A Conversation With the Authors of ‘Becoming Steve Jobs’,article,Technology,News,7,https://bits.blogs.nytimes.com/2015/03/24/a-conversation-with-the-authors-of-becoming-steve-jobs/
11,2015-03-17T15:33:41+0000,"The service, which could be announced later this year, would offer a bundle of channels that is smaller and cheaper than a typical cable subscription, according to people briefed on the plans.","The service, which could be announced later this year, would offer a bundle of channels that is smaller and cheaper than a typical cable subscription, according to people briefed on the plans.","Apple has held talks with a number of leading television groups to offer an Internet-based TV service for its iPhone, iPad and Apple TV set-top box, according to people briefed on the company’s plans.","Apple Said to Plan Limited, Low-Cost Streaming Service",article,Business Day,News,1,https://www.nytimes.com/2015/03/18/business/media/apple-said-to-plan-limited-low-cost-streaming-service.html
12,2015-03-02T17:55:05+0000,The Internal Revenue Service is not emailing you to request personal information. Not now. Not ever.,The Internal Revenue Service is not emailing you to request personal information. Not now. Not ever.,,The Tax Con Cometh,article,Technology,Question,4,https://www.nytimes.com/2015/03/05/technology/personaltech/the-tax-scam-cometh.html
13,2015-03-17T07:20:12+0000,Genius alone didn’t bring Apple back. It took management chops.,Genius alone didn’t bring Apple back. It took management chops.,"The relationship between journalists and Steve Jobs could often be fraught, but there were always a handful of reporters he liked and trusted. They included John Markoff of The New York Times; Steven Levy, formerly of Wired magazine (he’s now at Medium); Walt Mossberg, the longtime technology columnist for The Wall Street Journal (he’s now at Re/code); and Brent Schlender of Fortune. They had all been on the technology beat seemingly forever, and they had known Jobs for decades.",The Hidden Talent of Steve Jobs,article,Opinion,Op-Ed,4,https://www.nytimes.com/2015/03/17/opinion/joe-nocera-the-hidden-talent-of-steve-jobs.html
14,2015-03-09T17:34:00+0000,"The service will cost $14.99 a month, and will include current and past programming, as well as movies.","The service will cost $14.99 a month, and will include current and past programming, as well as movies.","HBO has linked with Apple for the start of its much-anticipated Internet streaming service, uniting two premium brands from the media and technology worlds in a quest to reinvent the way people watch television.","HBO’s Streaming Service Will Start in April, Initially on Apple Devices Only",article,Business Day,News,4,https://www.nytimes.com/2015/03/10/business/media/hbo-streaming-to-start-in-april-on-apple-devices-only.html


In [None]:
df.iloc[176, 1:3].abstract


'App Pins aims to do for smartphone apps what Pinterest’s service has done for photos, recipes and many other types of websites. In short, the service is a type of digital corkboard that lets visitors save, or “pin,” items they like or places they want to go.'

In [None]:
df.iloc[176, 1:3].snippet

'App Pins aims to do for smartphone apps what Pinterest’s service has done for photos, recipes and many other types of websites. In short, the service is a type of digital corkboard that lets visitors save, or “pin,” items they like or places they ...'

In [None]:
#check for empty strings
(df == "").sum()

pub_date             0
abstract             0
snippet              3
lead_para           19
headline             0
doc_type             0
section_name         0
type_of_material     0
rank                 0
web_url              0
dtype: int64

In [None]:
df[df.snippet == ""]

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
1568,2021-02-16T21:27:40+0000,A North Dakota bill that an Apple executive had warned “threatens to destroy iPhone as you know it” died in a vote on Tuesday.,,A North Dakota bill that an Apple executive had warned “threatens to destroy iPhone as you know it” died in a vote on Tuesday.,North Dakota lawmakers vote down a bill that threatened Apple’s and Google’s revenues.,article,Business Day,News,2,https://www.nytimes.com/2021/02/16/business/north-dakota-app-store-bill.html
1601,2021-05-03T11:31:09+0000,"Apple and Epic Games, maker of the wildly popular game Fortnite, are set to square off on Monday in a trial that could decide how much control Apple can exert over the app economy. The trial is scheduled to open with testimony from Tim Sweeney, the chief of Epic, on why he believes Apple is a monopoly abusing its power.",,"Apple and Epic Games, maker of the wildly popular game Fortnite, are set to square off on Monday in a trial that could decide how much control Apple can exert over the app economy. The trial is scheduled to open with testimony from Tim Sweeney, the chief of Epic, on why he believes Apple is a monopoly abusing its power.",Apple and Epic Games head to court over app revenue.,article,Business Day,News,5,https://www.nytimes.com/2021/05/03/business/apple-and-epic-games-head-to-court-over-app-revenue.html
1617,2021-05-27T12:21:50+0000,"When Apple and Google collaborated last year on a smartphone-based system to track the spread of the coronavirus, the news was seen as a game changer. The software uses Bluetooth signals to detect app users who come into close contact. If a user later tests positive, the person can anonymously notify other app users whom the person may have crossed paths with in restaurants, on trains or elsewhere.",,"When Apple and Google collaborated last year on a smartphone-based system to track the spread of the coronavirus, the news was seen as a game changer. The software uses Bluetooth signals to detect app users who come into close contact. If a user later tests positive, the person can anonymously notify other app users whom the person may have crossed paths with in restaurants, on trains or elsewhere.",Virus alert apps powered by Apple and Google have had limited success.,article,Business Day,News,5,https://www.nytimes.com/2021/05/27/business/virus-alert-apps-powered-by-apple-and-google-have-had-limited-success.html


In [None]:
# 19 lead_paras blank
df[df.lead_para == ""] 

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
8,2015-01-02T17:09:23+0000,"You can adjust your settings so that only certain people can see your images. Also, syncing files between Macs and PCs.","You can adjust your settings so that only certain people can see your images. Also, syncing files between Macs and PCs.",,Sharing Flickr Photos Privately,article,Technology,Question,5,https://www.nytimes.com/2015/01/02/technology/personaltech/sharing-flickr-photos-privately.html
26,2015-02-27T15:06:31+0000,"Learn about the differences between Amazon Fire TV and Fire TV stick (besides price), and also discover how to transfer airline boarding passes between iPhones.","Learn about the differences between Amazon Fire TV and Fire TV stick (besides price), and also discover how to transfer airline boarding passes between iPhones.",,Choosing Between Amazon’s TV Streaming Hardware,article,Technology,Question,4,https://www.nytimes.com/2015/02/27/technology/personaltech/choosing-between-amazons-tv-streaming-hardware.html
29,2015-02-11T14:35:37+0000,"Tips on how to transfer photographs into iPhoto on your Mac, and advice for connecting to a virtual private network from a mobile device.","Tips on how to transfer photographs into iPhoto on your Mac, and advice for connecting to a virtual private network from a mobile device.",,Importing Images to iPhoto,article,Technology,Question,12,https://www.nytimes.com/2015/02/12/technology/personaltech/importing-images-to-iphoto.html
67,2015-03-02T17:55:05+0000,The Internal Revenue Service is not emailing you to request personal information. Not now. Not ever.,The Internal Revenue Service is not emailing you to request personal information. Not now. Not ever.,,The Tax Con Cometh,article,Technology,Question,4,https://www.nytimes.com/2015/03/05/technology/personaltech/the-tax-scam-cometh.html
100,2015-04-17T14:07:02+0000,"Apps will allow users to peek at their balance and recent activity, but not to pay bills or make deposits.","Apps will allow users to peek at their balance and recent activity, but not to pay bills or make deposits.",,"Banking on Apple Watch Will Be Limited, for Now",article,Your Money,News,3,https://www.nytimes.com/2015/04/17/your-money/banking-on-apple-watch-will-be-limited-for-now.html
108,2015-04-03T13:42:14+0000,The inexpensive device links a standard audio cable from the music player to a computer’s USB jack.,The inexpensive device links a standard audio cable from the music player to a computer’s USB jack.,,Converting Audio From a Cassette to MP3 Requires an Adapter,article,Technology,Question,3,https://www.nytimes.com/2015/04/03/technology/personaltech/converting-audio-from-a-cassette-to-mp3-requires-an-adapter.html
116,2015-05-22T13:14:57+0000,You can send the message to someone not in the original conversation by taking a couple of steps.,You can send the message to someone not in the original conversation by taking a couple of steps.,,Forwarding Text Messages on the iPhone,article,Technology,Question,6,https://www.nytimes.com/2015/05/23/technology/personaltech/23askkextra.html
153,2015-06-12T14:07:31+0000,"Tips for mirroring your phone’s display on a Mac, and how to get desktop notifications from Gmail.","Tips for mirroring your phone’s display on a Mac, and how to get desktop notifications from Gmail.",,Recording a Phone’s Screen on a Mac,article,Technology,Question,1,https://www.nytimes.com/2015/06/13/technology/personaltech/recording-a-phones-screen-on-a-mac.html
167,2015-06-10T15:45:58+0000,Some independent booksellers gain a portion of e-book sales through partnerships with bigger e-bookstores like Kobo and Amazon Source.,Some independent booksellers gain a portion of e-book sales through partnerships with bigger e-bookstores like Kobo and Amazon Source.,,"Shop Locally, Download Globally",article,Technology,Question,4,https://www.nytimes.com/2015/06/11/technology/personaltech/shop-locally-download-globally.html
188,2015-07-15T17:56:34+0000,"How to rid the bookmarks menu of a utility you don’t use. Plus, browsing with Chrome on Windows 10 and learning MacBook trackpad gestures.","How to rid the bookmarks menu of a utility you don’t use. Plus, browsing with Chrome on Windows 10 and learning MacBook trackpad gestures.",,Picking Pockets Off the Firefox Bookmarks List,article,Technology,Question,7,https://www.nytimes.com/2015/07/16/technology/personaltech/picking-pockets-off-the-firefox-bookmarks-list.html


In [None]:
# check for duplicates 
df.duplicated().sum()
df[['headline']].duplicated().sum() # 6 duplicates, but on different days, due to regular published column
df[df[['headline']].duplicated()]

Unnamed: 0,pub_date,abstract,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url,NLTK_sentiment,sentiment_label,vader_sent,headline_label,headline_score,abstract_label,abstract_score,transf_sent
227,2015-04-24,"Get recommendations from New York Times reporters and editors, highlighting interesting stories from around the web. In this installment, great reads from John Branch, Dan Levin and others.","Get recommendations from New York Times reporters and editors, highlighting great stories from around the web. What We’re Reading emails are sent twice a week. Sign up »",What We’re Reading,article,Blogs,News,21,https://news.blogs.nytimes.com/2015/04/24/what-were-reading-49/,0.38915,Positive,0.38915,POSITIVE,0.997283,POSITIVE,0.999783,POSITIVE
251,2015-05-08,"Get recommendations from New York Times reporters and editors, highlighting interesting stories from around the web. In this installment, great reads from John Branch, Lynda Richardson and others.","Get recommendations from New York Times reporters and editors, highlighting great stories from around the web. What We’re Reading emails are sent twice a week. Sign up »",What We’re Reading,article,Blogs,News,15,https://news.blogs.nytimes.com/2015/05/08/what-were-reading-53/,0.38915,Positive,0.38915,POSITIVE,0.997283,POSITIVE,0.999802,POSITIVE
289,2015-04-14,"Get recommendations from New York Times reporters and editors, highlighting interesting stories from around the web. In this installment, great reads from Stacy Cowley, Quentin Hardy and others.","Get recommendations from New York Times reporters and editors, highlighting great stories from around the web. What We’re Reading emails are sent twice a week. Sign up »",What We’re Reading,article,Blogs,News,13,https://news.blogs.nytimes.com/2015/04/14/what-were-reading-46/,0.42955,Positive,0.42955,POSITIVE,0.997283,POSITIVE,0.999807,POSITIVE
572,2016-02-19,"Get recommendations from New York Times reporters and editors, highlig","Get recommendations from New York Times reporters and editors, highlighting great stories from around the web. What We’re Reading emails are sent twice a week. Sign up »",What We’re Reading,article,Blogs,News,19,https://news.blogs.nytimes.com/2016/02/19/what-were-reading-129/,0.0,Neutral,0.0,POSITIVE,0.997283,POSITIVE,0.889406,POSITIVE
939,2017-07-29,The world’s most valuable company appears to have pulled down the apps amid China’s deepening crackdown on tools that evade internet controls.,HONG KONG — China appears to have received help on Saturday from an unlikely source in its fight against tools that help users evade its Great Firewall of internet censorship: Apple.,Apple Removes Apps From China Store That Help Internet Users Evade Censorship,article,Technology,News,2,https://www.nytimes.com/2017/07/29/technology/china-apple-censorhip.html,0.46375,Positive,0.46375,NEGATIVE,0.995543,NEGATIVE,0.998216,NEGATIVE
1817,2022-05-11,Here’s what you need to know at the end of the day.,(Want to get this newsletter in your inbox? Here’s the sign-up.),Your Wednesday Evening Briefing,article,Briefing,News,5,https://www.nytimes.com/2022/05/11/briefing/roe-vote-consumer-prices-ipods.html,0.0,Neutral,0.0,POSITIVE,0.949314,NEGATIVE,0.939224,POSITIVE


In [23]:
# Check headlines
pd.set_option('display.max_colwidth', None)
print(df['headline']) 

0                                     Apple’s Near-Death Experience Saved It
1                                          Use the iPhone’s iOS 8 Like a Pro
2                       Apple Watch Displays Your Digital World, at a Glance
3                                       Troubleshooting Android in Safe Mode
4                               Suddenly, Plenty of Options for Cord Cutters
                                        ...                                 
2138    On These Apps, the Dark Promise of Mothers Sexually Abusing Children
2139                                The Apple of One Business Reporter’s Eye
2140             How to Clean Up Your Phone’s Photo Library to Free Up Space
2141                                  Your Driving App Is Leading You Astray
2142        Apple Settles E.U. Case by Opening Its Payment Service to Rivals
Name: headline, Length: 2143, dtype: object


In [25]:
# Check lead paras
df['lead_para'].head(10)

0                                                                                                                                                                                                                                                                                                                                                                                                                                                 James B. Stewart, on CNBC, discusses why Apple became twice as big as Microsoft. One reason: Apple’s willingness to “destroy” its own products in the interest of innovation.
1                                                                                                                                                                                                                                                                                                                                                                                                       

In [None]:
# Check abstract
df['abstract'].head(10)

0    James B. Stewart, on CNBC, discusses why Apple became twice as big as Microsoft. One reason: Apple’s willingness to “destroy” its own products in the interest of innovation.
1                           You probably didn’t know about these five features hidden in the iPhone’s iOS 8 operating system. Use them, and you’ll be using your phone like a pro.
2                      The Apple Watch is good looking for a smartwatch and not hard to use, according to initial impressions. But who knows if it is really a necessary purchase.
3                         Third-party apps and recent operating system updates are often the cause; starting the phone in Safe Mode can help narrow down the search for a culprit.
4                                               Here are some prominent services offering live TV, à la carte networks and other on-demand streaming options in the United States.
5                                                                                                        

#### Data Cleaning

1. Reformat date to datetime
2. Remove snippet
3. Remove rows with null/empty string values

In [194]:
import numpy as np 

# Reformat the date
df['pub_date'] = pd.to_datetime(df['pub_date']) 
df['pub_date'] = df['pub_date'].dt.date

# Remove snippet
df.drop(labels='snippet', inplace=True, axis=1)

# Remove null/empty string rows
df.replace('', np.nan, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,pub_date,abstract,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
0,2015-01-30,"James B. Stewart, on CNBC, discusses why Apple became twice as big as Microsoft. One reason: Apple’s willingness to “destroy” its own products in the interest of innovation.","James B. Stewart, on CNBC, discusses why Apple became twice as big as Microsoft. One reason: Apple’s willingness to “destroy” its own products in the interest of innovation.",Apple’s Near-Death Experience Saved It,multimedia,Business Day,Video,1,https://www.nytimes.com/video/business/100000003482723/apples-near-death-experience-saved-it.html
1,2015-01-14,"You probably didn’t know about these five features hidden in the iPhone’s iOS 8 operating system. Use them, and you’ll be using your phone like a pro.","You probably didn’t know about these five features hidden in the iPhone’s iOS 8 operating system. Use them, and you’ll be using your phone like a pro.",Use the iPhone’s iOS 8 Like a Pro,multimedia,Technology,Video,5,https://www.nytimes.com/video/technology/personaltech/100000003448481/use-the-iphones-ios8-like-a-pro.html
2,2015-03-10,"The Apple Watch is good looking for a smartwatch and not hard to use, according to initial impressions. But who knows if it is really a necessary purchase.","SAN FRANCISCO — When Apple unveiled its watch last fall, the company showed only demo models of the new device — polished prototypes of the hardware running nonworking loops of the software.","Apple Watch Displays Your Digital World, at a Glance",article,Technology,News,5,https://www.nytimes.com/2015/03/10/technology/personaltech/apple-watch-displays-your-digital-world-at-a-glance.html
3,2015-03-06,Third-party apps and recent operating system updates are often the cause; starting the phone in Safe Mode can help narrow down the search for a culprit.,Q. My Android phone just started freezing and crashing a few days ago. What would cause it to do so?,Troubleshooting Android in Safe Mode,article,Technology,Question,3,https://www.nytimes.com/2015/03/06/technology/personaltech/troubleshooting-android-in-safe-mode.html
4,2015-03-18,"Here are some prominent services offering live TV, à la carte networks and other on-demand streaming options in the United States.","Here are some prominent services offering live TV, à la carte networks and other on-demand streaming options in the United States.","Suddenly, Plenty of Options for Cord Cutters",multimedia,Business Day,Interactive Feature,3,https://www.nytimes.com/interactive/2015/business/media/streaming-tv-cord-cutting-guide.html


In [None]:
# save data to csv
file_name = 'Apple_Inc_text_data'
df.to_csv('../../data/cleaned/{}'.format(file_name), index=False)

# check = pd.read_csv('../../data/cleaned/{}'.format(file_name))
# check

### Misc Testing Code [UNUSED]

##### Processing data (testing 1 file)

In [None]:
# Read data from raw files
# testing with one file

import json

# Load content from json
with open('../../data/raw/2015/organizations_Apple_Inc_mth01_pg0', 'r') as file:
    data = json.load(file)

# Print data
print(len(data))

3


In [3]:
data["response"]['docs'][0]

{'abstract': 'It’s time to take note of what lies at the top of the fashion agenda for 2015.',
 'web_url': 'https://runway.blogs.nytimes.com/2015/01/05/what-to-watch-for-in-2015-galliano-and-gucci-elections-and-apple/',
 'snippet': 'It’s time to take note of what lies at the top of the fashion agenda for 2015.',
 'lead_paragraph': 'It is officially the first full week of the new year, and you know what that means: time to take note of what to watch for in 2015. I think it’s going to be a doozy. Why? Well, here’s what I have at the top of my agenda:',
 'source': 'The New York Times',
 'multimedia': [{'rank': 0,
   'subtype': 'wide',
   'caption': None,
   'credit': None,
   'type': 'image',
   'url': 'images/2015/01/05/fashion/05runway/05runway-thumbWide.jpg',
   'height': 126,
   'width': 190,
   'legacy': {'widewidth': 190,
    'wideheight': 126,
    'wide': 'images/2015/01/05/fashion/05runway/05runway-thumbWide.jpg'},
   'subType': 'wide',
   'crop_name': 'thumbWide'},
  {'rank': 0,


In [8]:
# check length 
len(data['response']['docs'])

10

In [9]:
# Need to extract abstract, web_url, snippet, lead_paragraph, pub_date, document_type, section_name, type_of_material, rank (?)
article = data['response']['docs'][0]

# Testing retrieval of components
pub_date = article['pub_date']

abstract = article['abstract']
snippet = article['snippet']
lead_para = article['lead_paragraph']
headline = article['headline']['main']

doc_type = article['document_type']
section_name = article['section_name']
type_of_material = article['type_of_material']

# Find the rank of Apple Inc keyword in article: gauge of relevance 
keywords = article['keywords']
rank = next((item['rank'] for item in keywords if item['name'] == 'organizations' and item['value'] == 'Apple Inc'), None) # next retrieves first matching rank, may not be necessary

web_url = article['web_url']

In [10]:
pub_date

'2015-01-05T14:03:03+0000'

##### Renaming files

In [None]:

import os
from pathlib import Path

base_dir = Path('../../data/raw')

# Loop through years
for year in range(2015, 2025):
    year_dir = base_dir / str(year)  # Construct path for each year

    # Traverse all files in the year directory
    for file in year_dir.rglob("*"):  # finds all files and subdirectories
        if file.is_file() and " " in file.name:
            new_name = file.name.replace(" ", "_")  
            new_path = file.with_name(new_name)  # Create new path
            file.rename(new_path)  # Rename the file
            print(f"Renamed: {file} -> {new_path}")  # Optional log output

Renamed: ../../data/raw/2015/organizations_Apple Inc_mth09_pg1 -> ../../data/raw/2015/organizations_Apple_Inc_mth09_pg1
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth08_pg1 -> ../../data/raw/2015/organizations_Apple_Inc_mth08_pg1
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth09_pg0 -> ../../data/raw/2015/organizations_Apple_Inc_mth09_pg0
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth08_pg0 -> ../../data/raw/2015/organizations_Apple_Inc_mth08_pg0
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth01_pg2 -> ../../data/raw/2015/organizations_Apple_Inc_mth01_pg2
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth03_pg3 -> ../../data/raw/2015/organizations_Apple_Inc_mth03_pg3
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth03_pg2 -> ../../data/raw/2015/organizations_Apple_Inc_mth03_pg2
Renamed: ../../data/raw/2015/organizations_Apple Inc_mth04_pg3 -> ../../data/raw/2015/organizations_Apple_Inc_mth04_pg3
Renamed: ../../data/raw/2015/organizatio