### Data Processing

#### Setup

In [3]:
import pandas as pd
import json
import os
from pathlib import Path
import numpy as np 

#### Reading JSON data into df

In [32]:
# Inputs for this run: the company tag that appears in the raw file names and
# the root folder that holds the raw JSON files.
company = 'Apple_Inc'
base_dir = Path('../../data/raw')

def read_file(path):
    """Load one raw JSON file and return the article records stored in it.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON file to read.

    Returns
    -------
    object
        The value stored under ``['response']['docs']`` in the file
        (``read_articles`` iterates over it as a sequence of article dicts).

    Raises
    ------
    KeyError
        If the payload has no ``response`` or ``docs`` key.
    """
    # Decode explicitly as UTF-8 instead of the platform default: the article
    # text contains non-ASCII characters (e.g. typographic apostrophes), which
    # a locale-dependent default codec can mangle or reject.
    with open(path, 'r', encoding='utf-8') as file:
        all_data = json.load(file)

    # Only the list of documents inside the response envelope is needed
    return all_data['response']['docs']
   

def read_articles(data, org_name='Apple Inc'):
    """Flatten raw article records into a DataFrame with one row per article.

    Parameters
    ----------
    data : iterable of dict
        Article records, e.g. the value returned by ``read_file``.
    org_name : str, default 'Apple Inc'
        Value of the ``organizations`` keyword whose ``rank`` is stored in the
        ``rank`` column. The default keeps the previous hard-coded behaviour;
        pass another name to process a different company.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with the columns ``pub_date``, ``abstract``,
        ``snippet``, ``lead_para``, ``headline``, ``doc_type``,
        ``section_name``, ``type_of_material``, ``rank`` and ``web_url``.
        ``type_of_material`` is None when the record has no such key;
        ``rank`` is None when no ``organizations`` keyword equals
        ``org_name``. An empty ``data`` gives an empty frame with the same
        columns.

    Raises
    ------
    KeyError
        If a record lacks any field other than ``type_of_material``.
    """
    column_list = ['pub_date','abstract','snippet','lead_para','headline','doc_type','section_name','type_of_material','rank','web_url']

    # Collect plain dicts and build the frame once at the end; appending with
    # df.loc[len(df)] copies the frame on every row and scales quadratically.
    rows = []
    for article in data:
        rows.append({
            'pub_date': article['pub_date'],
            'abstract': article['abstract'],
            'snippet': article['snippet'],
            'lead_para': article['lead_paragraph'],
            'headline': article['headline']['main'],
            'doc_type': article['document_type'],
            'section_name': article['section_name'],
            # Optional field: .get() returns None instead of raising KeyError
            'type_of_material': article.get('type_of_material', None),
            # Rank of the first keyword entry that matches the organisation
            'rank': next(
                (item['rank'] for item in article['keywords']
                 if item['name'] == 'organizations' and item['value'] == org_name),
                None,
            ),
            'web_url': article['web_url'],
        })

    # dtype=object keeps the column dtypes the row-by-row version produced
    return pd.DataFrame(rows, columns=column_list, dtype=object)


# MAIN function below
def compile_company_df(base_dir, company, years=range(2015, 2025)):
    """Collect every article file of one company into a single DataFrame.

    For each year, every file below ``base_dir/<year>`` (searched recursively)
    whose name contains ``company`` and does not contain ``'fulltext'`` is
    loaded with ``read_file`` and flattened with ``read_articles``.

    Parameters
    ----------
    base_dir : str or pathlib.Path
        Folder that contains one sub-folder per year.
    company : str
        Substring that a file name must contain to be included.
    years : iterable of int, default range(2015, 2025)
        Years (sub-folder names) to scan. The default reproduces the
        previously hard-coded 2015-2024 span.

    Returns
    -------
    pandas.DataFrame
        All matching articles, re-indexed from 0. Rows follow the order in
        which ``rglob`` yields the files. If nothing matches, an empty frame
        with the expected columns is returned.
    """
    column_list = ['pub_date','abstract','snippet','lead_para','headline','doc_type','section_name','type_of_material','rank','web_url']
    base_dir = Path(base_dir)

    # Gather the per-file frames and concatenate once; concatenating inside
    # the loop re-copies all accumulated rows for every file.
    frames = []
    for year in years:
        year_dir = base_dir / str(year)

        for file in year_dir.rglob("*"):
            # Drop the 'fulltext' test if full-text files should be included too
            if file.is_file() and company in file.name and 'fulltext' not in file.name:
                # Use the path rglob found: it may sit in a sub-folder, where
                # year_dir / file.name would point to a non-existent file.
                frames.append(read_articles(read_file(file)))

    if not frames:
        return pd.DataFrame(columns=column_list)
    return pd.concat(frames, ignore_index=True)

# Build the combined article table for the configured company.
# Raw files are named like ../../data/raw/2015/organizations_Apple_Inc_mth01_pg0.json
df = compile_company_df(base_dir, company)

#### Checking df

Findings:
<br><br>
(1) Use abstract instead of snippet
* Snippet and abstract are identical in all but 6 cases, and abstract is the more complete of the two
* Snippet has 3 "" (empty string) values
<br><br>

(2) Consider either abstract OR lead para 
* Abstract and lead para frequently have some overlaps
* Lead para is often longer/more detailed, but may not summarise the key point
<br><br>

(3) Null/empty string values
* Lead para has 19 "" (empty string) values
* Type of material has 1 null value
<br><br>

(4) No duplicates
* Headlines/abstracts repeat only occasionally, and those repeats were published on different dates (regularly published columns)
<br><br>

(Using Apple Inc as an example)

In [16]:
# Summary of the compiled frame: row count plus dtype and non-null count per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2143 entries, 0 to 2142
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pub_date          2143 non-null   object
 1   abstract          2143 non-null   object
 2   snippet           2143 non-null   object
 3   lead_para         2143 non-null   object
 4   headline          2143 non-null   object
 5   doc_type          2143 non-null   object
 6   section_name      2143 non-null   object
 7   type_of_material  2142 non-null   object
 8   rank              2143 non-null   object
 9   web_url           2143 non-null   object
dtypes: object(10)
memory usage: 167.6+ KB


In [17]:
# Size of the subset of rows where abstract and snippet are not identical
df.loc[df['abstract'].ne(df['snippet'])].shape

(6, 10)

In [18]:
# Check if lead para == abstract: number of differing rows, then a sample of them
abstract_differs = df.abstract != df.lead_para
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly or it is silently discarded.
print(df[abstract_differs].shape)
df[abstract_differs].head(10)


Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
0,2015-04-07T09:17:59+0000,"Want to work at Amazon, Apple or McKinsey? Som...","Want to work at Amazon, Apple or McKinsey? Som...","With some 13,000 graduate schools of business ...",M.B.A. Programs That Get You Where You Want to Go,article,Education,News,7,https://www.nytimes.com/2015/04/12/education/e...
1,2015-04-14T20:46:01+0000,Get recommendations from New York Times report...,Get recommendations from New York Times report...,Get recommendations from New York Times report...,What We’re Reading,article,Blogs,News,13,https://news.blogs.nytimes.com/2015/04/14/what...
2,2015-04-13T22:00:31+0000,The business unit will partner with companies ...,The business unit will partner with companies ...,IBM is taking its Watson artificial-intelligen...,IBM Creates Watson Health to Analyze Medical Data,article,Technology,News,8,https://bits.blogs.nytimes.com/2015/04/13/ibm-...
3,2015-04-22T19:42:26+0000,"With superstars first in line, Apple appears t...","With superstars first in line, Apple appears t...","Two weeks ago, Pharrell Williams posted an Ins...",What’s That on Beyoncé’s Wrist? Let Me Guess ....,article,Style,News,1,https://www.nytimes.com/2015/04/23/style/whats...
4,2015-04-01T10:21:53+0000,"In an industry that avoids controversy, the he...","In an industry that avoids controversy, the he...",The technology industry’s leaders have found t...,Daily Report: Tech Leaders Come Together to Op...,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/04/01/dail...
5,2015-04-08T12:01:17+0000,"Spending seven days learning to use, and getti...","Spending seven days learning to use, and getti...",I picked up my Apple Watch last Wednesday at A...,Dear Diary: My Week Wearing an Apple Watch,article,Technology,News,1,https://www.nytimes.com/2015/04/09/technology/...
6,2015-04-07T13:32:19+0000,The iTunes software can automatically sync wit...,The iTunes software can automatically sync wit...,ITunes Problems,Diagnosing a Syncing Problem With iTunes,article,Technology,Question,5,https://www.nytimes.com/2015/04/09/technology/...
7,2015-04-14T00:51:55+0000,"Laurence D. Fink, chief of BlackRock, the worl...","Laurence D. Fink, chief of BlackRock, the worl...","On Tuesday morning, the chief executives of 50...","BlackRock’s Chief, Laurence Fink, Urges Other ...",article,Business Day,News,7,https://www.nytimes.com/2015/04/14/business/de...
8,2015-04-08T12:01:04+0000,When apps are streamlined and first-generation...,When apps are streamlined and first-generation...,"It took three days — three long, often confusi...","Apple Watch Review: Bliss, but Only After a St...",article,Technology,News,2,https://www.nytimes.com/2015/04/09/technology/...
9,2015-04-23T17:42:36+0000,Tips for Time Warner Cable customers who want ...,Tips for Time Warner Cable customers who want ...,Q. I want to be able to set programs to record...,Remotely Controlling the DVR,article,Technology,Question,6,https://www.nytimes.com/2015/04/24/technology/...


In [19]:
# Abstract text stored at row position 176
df['abstract'].iloc[176]


'We’re not creating the new businesses we should be, and these giants have to be broken up.'

In [20]:
# Snippet text stored at row position 176
df['snippet'].iloc[176]

'We’re not creating the new businesses we should be, and these giants have to be broken up.'

In [21]:
# Number of empty-string ("") entries in each column
df.eq("").sum()

pub_date             0
abstract             0
snippet              3
lead_para           19
headline             0
doc_type             0
section_name         0
type_of_material     0
rank                 0
web_url              0
dtype: int64

In [22]:
# Rows whose snippet is an empty string
df.loc[df['snippet'].eq("")]

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
1575,2021-05-27T12:21:50+0000,When Apple and Google collaborated last year o...,,When Apple and Google collaborated last year o...,Virus alert apps powered by Apple and Google h...,article,Business Day,News,5,https://www.nytimes.com/2021/05/27/business/vi...
1598,2021-05-03T11:31:09+0000,"Apple and Epic Games, maker of the wildly popu...",,"Apple and Epic Games, maker of the wildly popu...",Apple and Epic Games head to court over app re...,article,Business Day,News,5,https://www.nytimes.com/2021/05/03/business/ap...
1641,2021-02-16T21:27:40+0000,A North Dakota bill that an Apple executive ha...,,A North Dakota bill that an Apple executive ha...,North Dakota lawmakers vote down a bill that t...,article,Business Day,News,2,https://www.nytimes.com/2021/02/16/business/no...


In [23]:
# Rows whose lead paragraph is an empty string (19 of them)
df.loc[df['lead_para'].eq("")]

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
55,2015-05-22T13:14:57+0000,You can send the message to someone not in the...,You can send the message to someone not in the...,,Forwarding Text Messages on the iPhone,article,Technology,Question,6,https://www.nytimes.com/2015/05/23/technology/...
69,2015-06-10T15:45:58+0000,Some independent booksellers gain a portion of...,Some independent booksellers gain a portion of...,,"Shop Locally, Download Globally",article,Technology,Question,4,https://www.nytimes.com/2015/06/11/technology/...
90,2015-08-26T16:38:32+0000,Microsoft collects personal data on users of W...,Microsoft collects personal data on users of W...,,Staying Private in Windows 10,article,Technology,Question,6,https://www.nytimes.com/2015/08/27/technology/...
106,2015-02-27T15:06:31+0000,Learn about the differences between Amazon Fir...,Learn about the differences between Amazon Fir...,,Choosing Between Amazon’s TV Streaming Hardware,article,Technology,Question,4,https://www.nytimes.com/2015/02/27/technology/...
109,2015-02-11T14:35:37+0000,Tips on how to transfer photographs into iPhot...,Tips on how to transfer photographs into iPhot...,,Importing Images to iPhoto,article,Technology,Question,12,https://www.nytimes.com/2015/02/12/technology/...
114,2015-10-14T20:42:04+0000,Upgrading from an older version of Windows can...,Upgrading from an older version of Windows can...,,Restoring Sound to Windows 10,article,Technology,Question,7,https://www.nytimes.com/2015/10/15/technology/...
149,2015-09-16T13:32:10+0000,If your computer did not come with a microphon...,If your computer did not come with a microphon...,,How to Talk to Microsoft’s Cortana,article,Technology,Question,5,https://www.nytimes.com/2015/09/17/technology/...
157,2015-08-07T13:36:02+0000,Recovering discarded photos before they are go...,Recovering discarded photos before they are go...,,Digging for Trashed Pictures in Mac’s Photos App,article,Technology,Question,1,https://www.nytimes.com/2015/08/08/technology/...
184,2015-09-04T15:22:20+0000,How to get your Windows 10 operating system to...,How to get your Windows 10 operating system to...,,Coaxing a Printer to Work With Windows 10,article,Technology,Question,2,https://www.nytimes.com/2015/09/05/technology/...
194,2015-07-15T17:56:34+0000,How to rid the bookmarks menu of a utility you...,How to rid the bookmarks menu of a utility you...,,Picking Pockets Off the Firefox Bookmarks List,article,Technology,Question,7,https://www.nytimes.com/2015/07/16/technology/...


In [24]:
# Check for duplicates. Only the last expression of a cell is displayed, so
# the two counts are printed explicitly instead of being silently discarded.
print(df.duplicated().sum())  # rows that are identical in every column
repeated_headline = df['headline'].duplicated()
print(repeated_headline.sum())  # 6 duplicates, but on different days, due to regularly published columns
df[repeated_headline]

Unnamed: 0,pub_date,abstract,snippet,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
13,2015-05-08T20:58:24+0000,Get recommendations from New York Times report...,Get recommendations from New York Times report...,Get recommendations from New York Times report...,What We’re Reading,article,Blogs,News,15,https://news.blogs.nytimes.com/2015/05/08/what...
50,2015-04-24T21:04:40+0000,Get recommendations from New York Times report...,Get recommendations from New York Times report...,Get recommendations from New York Times report...,What We’re Reading,article,Blogs,News,21,https://news.blogs.nytimes.com/2015/04/24/what...
282,2015-04-07T21:14:06+0000,Get recommendations from New York Times report...,Get recommendations from New York Times report...,Get recommendations from New York Times report...,What We’re Reading,article,Blogs,News,16,https://news.blogs.nytimes.com/2015/04/07/what...
367,2016-02-19T21:52:48+0000,Get recommendations from New York Times report...,Get recommendations from New York Times report...,Get recommendations from New York Times report...,What We’re Reading,article,Blogs,News,19,https://news.blogs.nytimes.com/2016/02/19/what...
924,2017-07-30T18:23:44+0000,The world’s most valuable company appears to h...,The world’s most valuable company appears to h...,HONG KONG — Software made by foreign companies...,Apple Removes Apps From China Store That Help ...,article,Technology,News,2,https://www.nytimes.com/2017/07/30/technology/...
1776,2022-09-07T21:55:35+0000,Here’s what you need to know at the end of the...,Here’s what you need to know at the end of the...,(Want to get this newsletter in your inbox? He...,Your Wednesday Evening Briefing,article,Briefing,News,6,https://www.nytimes.com/2022/09/07/briefing/vo...


In [25]:
# Check headlines
# NOTE: set_option changes the pandas display setting globally, so the
# untruncated column width also applies to every cell executed after this one.
pd.set_option('display.max_colwidth', None)
print(df['headline']) 

0                         M.B.A. Programs That Get You Where You Want to Go
1                                                        What We’re Reading
2                         IBM Creates Watson Health to Analyze Medical Data
3           What’s That on Beyoncé’s Wrist? Let Me Guess ... an Apple Watch
4            Daily Report: Tech Leaders Come Together to Oppose Indiana Law
                                       ...                                 
2138           Apple Takes a Humble Approach to Launching Its Newest Device
2139    Apple Overhauls App Store in Europe, in Response to New Digital Law
2140                 The Apple Vision Pro Is a Marvel. But Who Will Buy It?
2141      U.S. Moves Closer to Filing Sweeping Antitrust Case Against Apple
2142                                      Charms Can Personalize Your Watch
Name: headline, Length: 2143, dtype: object


In [26]:
# Preview the first ten lead paragraphs
df.lead_para.head(10)

0                                                                                                                                                                                                                                                                       With some 13,000 graduate schools of business across the globe, the M.B.A. degree has clearly become a commodity.
1                                                                                                                                                                                                               Get recommendations from New York Times reporters and editors, highlighting great stories from around the web. What We’re Reading emails are sent twice a week. Sign up »
2                                                                                                                                                                                                                       IBM is taking its Watson art

In [27]:
# Preview the first ten abstracts
df.abstract.head(10)

0                                                      Want to work at Amazon, Apple or McKinsey? Some business schools have impressive records placing graduates in certain fields and even companies.
1    Get recommendations from New York Times reporters and editors, highlighting interesting stories from around the web. In this installment, great reads from Stacy Cowley, Quentin Hardy and others.
2    The business unit will partner with companies including Apple, Medtronic and Johnson & Johnson, offering IBM’s Watson technology as a cloud-based tool at many levels of the health care industry.
3                                                                     With superstars first in line, Apple appears to be seeking exclusivity by limiting where its watches are sold and who wears them.
4                                                   In an industry that avoids controversy, the heads of several prominent companies, including Apple and Salesforce.com, have chosen to pick a battle.


#### Data Cleaning

1. Reformat date to datetime
2. Remove snippet
3. Remove rows with null/empty string values

In [33]:
# Clean the compiled frame in one non-mutating chain so the cell can be re-run
# safely: the in-place version raised KeyError on a second run because
# 'snippet' had already been dropped.
df = (
    df
    # Reformat the date: parse the timestamps as UTC and keep the calendar date
    .assign(pub_date=lambda frame: pd.to_datetime(frame['pub_date'], utc=True).dt.date)
    # Remove snippet (errors='ignore' makes a repeated run a no-op)
    .drop(columns='snippet', errors='ignore')
    # Remove null/empty string rows: treat "" as missing, then drop any row
    # that has a missing value in any column
    .replace('', np.nan)
    .dropna()
)
df.head()

Unnamed: 0,pub_date,abstract,lead_para,headline,doc_type,section_name,type_of_material,rank,web_url
0,2015-04-07,"Want to work at Amazon, Apple or McKinsey? Some business schools have impressive records placing graduates in certain fields and even companies.","With some 13,000 graduate schools of business across the globe, the M.B.A. degree has clearly become a commodity.",M.B.A. Programs That Get You Where You Want to Go,article,Education,News,7,https://www.nytimes.com/2015/04/12/education/edlife/mba-programs-that-get-you-where-you-want-to-go.html
1,2015-04-14,"Get recommendations from New York Times reporters and editors, highlighting interesting stories from around the web. In this installment, great reads from Stacy Cowley, Quentin Hardy and others.","Get recommendations from New York Times reporters and editors, highlighting great stories from around the web. What We’re Reading emails are sent twice a week. Sign up »",What We’re Reading,article,Blogs,News,13,https://news.blogs.nytimes.com/2015/04/14/what-were-reading-46/
2,2015-04-13,"The business unit will partner with companies including Apple, Medtronic and Johnson & Johnson, offering IBM’s Watson technology as a cloud-based tool at many levels of the health care industry.","IBM is taking its Watson artificial-intelligence technology into health care in a big way with industry partners, a pair of acquisitions and an ambitious agenda.",IBM Creates Watson Health to Analyze Medical Data,article,Technology,News,8,https://bits.blogs.nytimes.com/2015/04/13/ibm-creates-watson-health-to-analyze-medical-data/
3,2015-04-22,"With superstars first in line, Apple appears to be seeking exclusivity by limiting where its watches are sold and who wears them.","Two weeks ago, Pharrell Williams posted an Instagram video of his Apple Watch. The clip has more than 119,000 likes.",What’s That on Beyoncé’s Wrist? Let Me Guess ... an Apple Watch,article,Style,News,1,https://www.nytimes.com/2015/04/23/style/whats-that-on-beyonces-wrist-let-me-guess-an-apple-watch.html
4,2015-04-01,"In an industry that avoids controversy, the heads of several prominent companies, including Apple and Salesforce.com, have chosen to pick a battle.","The technology industry’s leaders have found their collective voice on a social issue in the last week, rallying with great intensity against a new Indiana law that will allow businesses, they predict, to discriminate against gay couples. The heads of Apple, Salesforce.com, Yelp and Square have all publicly criticized the law, as have some leaders from other industries.",Daily Report: Tech Leaders Come Together to Oppose Indiana Law,article,Technology,News,3,https://bits.blogs.nytimes.com/2015/04/01/daily-report-tech-leaders-come-together-to-oppose-indiana-law/


In [34]:
# Check the cleaned frame: the snippet column should be gone and every
# remaining column should have the same non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2124 entries, 0 to 2142
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   pub_date          2124 non-null   object
 1   abstract          2124 non-null   object
 2   lead_para         2124 non-null   object
 3   headline          2124 non-null   object
 4   doc_type          2124 non-null   object
 5   section_name      2124 non-null   object
 6   type_of_material  2124 non-null   object
 7   rank              2124 non-null   object
 8   web_url           2124 non-null   object
dtypes: object(9)
memory usage: 165.9+ KB


In [36]:
# Save the cleaned frame to ../../data/cleaned/<company>_text_data.csv
file_name = '{}_text_data'.format(company)
out_path = Path('../../data/cleaned') / '{}.csv'.format(file_name)
# to_csv does not create missing folders, so make sure the target exists
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

# To verify the round trip, read the file back (note the .csv suffix):
# pd.read_csv(out_path)