In [None]:
# Import packages and modules
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil import parser
import matplotlib.pyplot as plt
import seaborn as sns


from collections import Counter

import nltk
# Fetch the NLTK resources used by the tokenizers and the stopword list imported below.
# The downloader only reports "already up-to-date" when a package is already installed.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the Federal Register document data.
# (The CSV was produced by the notebook "1-GetDocs&StockData.ipynb".)
docs_csv = 'fed_reg_docs_8yr.csv'
df_docs = pd.read_csv(docs_csv)
df_docs

Unnamed: 0,date,abstract,title,topics,agency_names,raw_text_url
0,2008-08-08,This final rule updates the payment rates used...,Medicare Program; Prospective Payment System a...,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...
1,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; La Crosse, WI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...
2,2008-08-08,The General Services Administration (GSA) is a...,General Services Administration Acquisition Re...,,General Services Administration,https://www.federalregister.gov/documents/full...
3,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Honolulu, HI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...
4,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Bangor, ME",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...
...,...,...,...,...,...,...
48569,2016-07-01,We are adopting a new airworthiness directive ...,Airworthiness Directives; Rolls-Royce Deutschl...,Air transportation; Aircraft; Aviation safety;...,Transportation Department; Federal Aviation Ad...,https://www.federalregister.gov/documents/full...
48570,2016-07-01,This action modifies Class E airspace extendin...,Amendment of Class E Airspace for the Followin...,Airspace; Navigation (air),Transportation Department; Federal Aviation Ad...,https://www.federalregister.gov/documents/full...
48571,2016-07-01,The Coast Guard will enforce five safety zones...,Safety Zones; Annual Firework Displays Within ...,,Homeland Security Department; Coast Guard,https://www.federalregister.gov/documents/full...
48572,2016-07-01,The Coast Guard is establishing a temporary sa...,"Safety Zone, Shallowbag Bay; Manteo, NC",Harbors; Marine safety; Navigation (water); Re...,Homeland Security Department; Coast Guard,https://www.federalregister.gov/documents/full...


In [None]:
# Convert the "date" strings into real timestamps (datetime64)
df_docs = df_docs.assign(date=pd.to_datetime(df_docs['date']))
# The non-null counts below show several columns with missing values that need handling
df_docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48574 entries, 0 to 48573
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          48574 non-null  datetime64[ns]
 1   abstract      47828 non-null  object        
 2   title         48574 non-null  object        
 3   topics        34379 non-null  object        
 4   agency_names  48549 non-null  object        
 5   raw_text_url  48574 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 2.2+ MB


In [None]:
# Use the publication date as the row index
df_docs = df_docs.set_index('date')
df_docs.head(n=1)

Unnamed: 0_level_0,abstract,title,topics,agency_names,raw_text_url
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-08-08,This final rule updates the payment rates used...,Medicare Program; Prospective Payment System a...,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...


In [None]:
# Count the missing values in each column of the frame
df_docs.isna().sum()

abstract          746
title               0
topics          14195
agency_names       25
raw_text_url        0
dtype: int64

In [None]:
# 746 abstracts are missing, so build a "synopsis" column that always carries text by
# combining the "abstract" and "title" columns.
# Missing values are replaced with '' before concatenating: astype(str) on its own would
# turn NaN into the literal text "nan". A single space separates the two parts so the last
# word of the abstract does not fuse with the first word of the title, and the final strip
# removes the stray leading space left when the abstract is empty.
df_docs['synopsis'] = (
    df_docs['abstract'].fillna('').astype(str)
    + ' '
    + df_docs['title'].fillna('').astype(str)
).str.strip()
df_docs.head()

Unnamed: 0_level_0,abstract,title,topics,agency_names,raw_text_url,synopsis
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-08-08,This final rule updates the payment rates used...,Medicare Program; Prospective Payment System a...,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...,This final rule updates the payment rates used...
2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; La Crosse, WI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...
2008-08-08,The General Services Administration (GSA) is a...,General Services Administration Acquisition Re...,,General Services Administration,https://www.federalregister.gov/documents/full...,The General Services Administration (GSA) is a...
2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Honolulu, HI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...
2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Bangor, ME",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...


In [None]:
# Look at a single synopsis (the first row).
# .iloc selects by position. The frame is indexed by date at this point, so a bare [0]
# would rely on the deprecated "integer key treated as a position" fallback.
df_docs['synopsis'].iloc[0]

'This final rule updates the payment rates used under the prospective payment system (PPS) for skilled nursing facilities (SNFs), for fiscal year (FY) 2009. It also discusses our ongoing analysis of nursing home staff time measurement data collected in the Staff Time and Resource Intensity Verification (STRIVE) project. Finally, this final rule makes technical corrections in the regulations text with respect to Medicare bad debt payments to SNFs and the reference to the definition of urban and rural as applied to SNFs.Medicare Program; Prospective Payment System and Consolidated Billing for Skilled Nursing Facilities for FY 2009'

In [None]:
# The title of the same (first) row, which was appended to the synopsis cell.
# .iloc selects by position; a bare [0] on the date-indexed column relies on a deprecated
# positional fallback.
df_docs['title'].iloc[0]

'Medicare Program; Prospective Payment System and Consolidated Billing for Skilled Nursing Facilities for FY 2009'

In [None]:
# Confirm that the new "synopsis" column is populated for every row
# (its non-null count should equal the number of entries).
df_docs.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 48574 entries, 2008-08-08 to 2016-07-01
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   abstract      47828 non-null  object
 1   title         48574 non-null  object
 2   topics        34379 non-null  object
 3   agency_names  48549 non-null  object
 4   raw_text_url  48574 non-null  object
 5   synopsis      48574 non-null  object
dtypes: object(6)
memory usage: 3.8+ MB


In [None]:
# Move the date out of the index so it can be used as a regular column again.
# The guard makes this cell safe to re-run: an unguarded second reset_index() would push the
# fresh RangeIndex into an extra, meaningless "index" column.
if 'date' not in df_docs.columns:
    df_docs = df_docs.reset_index()
df_docs.head(1)

Unnamed: 0,index,date,abstract,title,topics,agency_names,raw_text_url,synopsis
0,0,2008-08-08,This final rule updates the payment rates used...,Medicare Program; Prospective Payment System a...,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...,This final rule updates the payment rates used...


In [None]:
# Goal: collapse every column so that there is one row per date.
# First step: how many documents were published on each date.
docs_per_day = df_docs.groupby('date').size()
df_count_dates = docs_per_day.rename('counts').reset_index()
df_count_dates

Unnamed: 0,date,counts
0,2008-08-08,18
1,2008-08-11,21
2,2008-08-12,25
3,2008-08-13,28
4,2008-08-14,19
...,...,...
1969,2016-06-27,31
1970,2016-06-28,40
1971,2016-06-29,25
1972,2016-06-30,54


In [None]:
# Collapse the per-document synopses into one cell per date:
# group the rows by date, glue each day's synopsis strings together with commas,
# then turn the date index back into an ordinary column.
synopsis_by_date = df_docs.groupby('date')['synopsis']
df_synopsis_grp = synopsis_by_date.agg(','.join).reset_index()
df_synopsis_grp

Unnamed: 0,date,synopsis
0,2008-08-08,This final rule updates the payment rates used...
1,2008-08-11,nanGeneral Services Acquisition Regulation; GS...
2,2008-08-12,"We, the U.S. Fish and Wildlife Service (Servic..."
3,2008-08-13,The General Services Administration (GSA) is p...
4,2008-08-14,"We, the U.S. Fish and Wildlife Service (we or ..."
...,...,...
1969,2016-06-27,The Coast Guard announces a public meeting to ...
1970,2016-06-28,The Coast Guard proposes to establish a tempor...
1971,2016-06-29,"In January 2011, NMFS implemented the trawl ra..."
1972,2016-06-30,The Environmental Protection Agency (EPA) is p...


In [None]:
# 25 rows have no agency name. Fill them with the placeholder "NoAgencies" so that the
# per-date string join that follows only ever sees strings.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection acts on a temporary object and does not update df_docs under pandas Copy-on-Write.
df_docs['agency_names'] = df_docs['agency_names'].fillna('NoAgencies')
df_docs.head()

Unnamed: 0,index,date,abstract,title,topics,agency_names,raw_text_url,synopsis
0,0,2008-08-08,This final rule updates the payment rates used...,Medicare Program; Prospective Payment System a...,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...,This final rule updates the payment rates used...
1,1,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; La Crosse, WI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...
2,2,2008-08-08,The General Services Administration (GSA) is a...,General Services Administration Acquisition Re...,,General Services Administration,https://www.federalregister.gov/documents/full...,The General Services Administration (GSA) is a...
3,3,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Honolulu, HI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...
4,4,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Bangor, ME",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...


In [None]:
# Collapse the agency names into one cell per date:
# group the rows by date, join each day's agency strings with commas,
# then turn the date index back into an ordinary column.
agencies_by_date = df_docs.groupby('date')['agency_names']
df_agency_grp = agencies_by_date.agg(','.join).reset_index()
df_agency_grp

Unnamed: 0,date,agency_names
0,2008-08-08,Health and Human Services Department; Centers ...
1,2008-08-11,"General Services Administration,Health and Hum..."
2,2008-08-12,Interior Department; Fish and Wildlife Service...
3,2008-08-13,"General Services Administration,Health and Hum..."
4,2008-08-14,Interior Department; Fish and Wildlife Service...
...,...,...
1969,2016-06-27,"Homeland Security Department; Coast Guard,Secu..."
1970,2016-06-28,"Homeland Security Department; Coast Guard,Inte..."
1971,2016-06-29,Commerce Department; National Oceanic and Atmo...
1972,2016-06-30,"Environmental Protection Agency,Environmental ..."


In [None]:
# Turn the commas introduced by the per-date join into semicolons, so ";" is the only
# separator in the cell.
# NOTE(review): every comma is replaced, including any comma that is part of an agency's
# own name.
df_agency_grp['agency_names'] = df_agency_grp['agency_names'].map(
    lambda names: names.replace(',', ';')
)

In [None]:
# Inspect the combined agency string of the first date
df_agency_grp.loc[0, 'agency_names']


'Health and Human Services Department; Centers for Medicare & Medicaid Services;Federal Communications Commission;General Services Administration;Federal Communications Commission;Federal Communications Commission;Interior Department; National Park Service;Interior Department; Surface Mining Reclamation and Enforcement Office;Nuclear Regulatory Commission;Health and Human Services Department; Centers for Medicare & Medicaid Services;Health and Human Services Department; Children and Families Administration;Homeland Security Department; Coast Guard;Homeland Security Department; Coast Guard;Homeland Security Department; Coast Guard;Homeland Security Department; Coast Guard;Federal Reserve System;Agriculture Department; Food and Nutrition Service;Health and Human Services Department; Centers for Medicare & Medicaid Services;Environmental Protection Agency'

In [None]:
# Display the grouped agencies again to check the separator between documents
df_agency_grp

Unnamed: 0,date,agency_names
0,2008-08-08,Health and Human Services Department; Centers ...
1,2008-08-11,General Services Administration;Health and Hum...
2,2008-08-12,Interior Department; Fish and Wildlife Service...
3,2008-08-13,General Services Administration;Health and Hum...
4,2008-08-14,Interior Department; Fish and Wildlife Service...
...,...,...
1969,2016-06-27,Homeland Security Department; Coast Guard;Secu...
1970,2016-06-28,Homeland Security Department; Coast Guard;Inte...
1971,2016-06-29,Commerce Department; National Oceanic and Atmo...
1972,2016-06-30,Environmental Protection Agency;Environmental ...


In [None]:
# Spot-check the combined agency string of another date (row label 100)
df_agency_grp.loc[100, 'agency_names']

'Homeland Security Department; Coast Guard;Agriculture Department; Rural Utilities Service;Interior Department; Fish and Wildlife Service;Commerce Department; Industry and Security Bureau;Agriculture Department; Food and Nutrition Service;Transportation Department; Surface Transportation Board;Agriculture Department; Rural Utilities Service;Homeland Security Department;Transportation Department; Federal Aviation Administration;Energy Department;Homeland Security Department; Federal Emergency Management Agency;Education Department'

In [None]:
# 14195 rows have no topics. Fill them with the placeholder "NoTopic" so that the per-date
# string join that follows only ever sees strings.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection acts on a temporary object and does not update df_docs under pandas Copy-on-Write.
df_docs['topics'] = df_docs['topics'].fillna('NoTopic')
df_docs.head()

Unnamed: 0,index,date,abstract,title,topics,agency_names,raw_text_url,synopsis
0,0,2008-08-08,This final rule updates the payment rates used...,Medicare Program; Prospective Payment System a...,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...,This final rule updates the payment rates used...
1,1,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; La Crosse, WI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...
2,2,2008-08-08,The General Services Administration (GSA) is a...,General Services Administration Acquisition Re...,NoTopic,General Services Administration,https://www.federalregister.gov/documents/full...,The General Services Administration (GSA) is a...
3,3,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Honolulu, HI",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...
4,4,2008-08-08,The Commission requests comments on a channel ...,"Television Broadcasting Services; Bangor, ME",Television,Federal Communications Commission,https://www.federalregister.gov/documents/full...,The Commission requests comments on a channel ...


In [None]:
# Re-check the non-null counts after filling "agency_names" and "topics"
df_docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48574 entries, 0 to 48573
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   index         48574 non-null  int64         
 1   date          48574 non-null  datetime64[ns]
 2   abstract      47828 non-null  object        
 3   title         48574 non-null  object        
 4   topics        48574 non-null  object        
 5   agency_names  48574 non-null  object        
 6   raw_text_url  48574 non-null  object        
 7   synopsis      48574 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 3.0+ MB


In [None]:
# Collapse the topics into one cell per date:
# group the rows by date, join each day's topic strings with commas,
# then turn the date index back into an ordinary column.
topics_by_date = df_docs.groupby('date')['topics']
df_topics_grp = topics_by_date.agg(','.join).reset_index()
df_topics_grp

Unnamed: 0,date,topics
0,2008-08-08,Health facilities; Kidney diseases; Medicare; ...
1,2008-08-11,"NoTopic,NoTopic,Income taxes; Reporting and re..."
2,2008-08-12,"NoTopic,NoTopic,NoTopic,Administrative practic..."
3,2008-08-13,"NoTopic,Grant programs-health; Health faciliti..."
4,2008-08-14,"NoTopic,NoTopic,NoTopic,Freedom of information..."
...,...,...
1969,2016-06-27,"NoTopic,Brokers; Reporting and recordkeeping r..."
1970,2016-06-28,Harbors; Marine safety; Navigation (water); Re...
1971,2016-06-29,"Fisheries; Fishing,Airspace; Navigation (air),..."
1972,2016-06-30,Air pollution control; Carbon monoxide; Enviro...


In [None]:
# Collapse the raw-text URLs into one cell per date:
# group the rows by date, join each day's URLs with commas,
# then turn the date index back into an ordinary column.
urls_by_date = df_docs.groupby('date')['raw_text_url']
df_url_grp = urls_by_date.agg(','.join).reset_index()
df_url_grp

Unnamed: 0,date,raw_text_url
0,2008-08-08,https://www.federalregister.gov/documents/full...
1,2008-08-11,https://www.federalregister.gov/documents/full...
2,2008-08-12,https://www.federalregister.gov/documents/full...
3,2008-08-13,https://www.federalregister.gov/documents/full...
4,2008-08-14,https://www.federalregister.gov/documents/full...
...,...,...
1969,2016-06-27,https://www.federalregister.gov/documents/full...
1970,2016-06-28,https://www.federalregister.gov/documents/full...
1971,2016-06-29,https://www.federalregister.gov/documents/full...
1972,2016-06-30,https://www.federalregister.gov/documents/full...


In [None]:
# Assemble the per-date frame: start from the grouped synopses and attach the document
# count, topics, agencies and URLs that were grouped separately.
# Column assignment pairs rows by index label (0..n-1), not by date, so first verify that
# every grouped frame lists exactly the same dates in the same order.
for grouped in (df_count_dates, df_topics_grp, df_agency_grp, df_url_grp):
    assert grouped['date'].equals(df_synopsis_grp['date']), \
        "grouped frames are not aligned by date"

df_synopsis_grp['group_count'] = df_count_dates['counts']
df_synopsis_grp['topic'] = df_topics_grp['topics']
df_synopsis_grp['agencies'] = df_agency_grp['agency_names']
df_synopsis_grp['text_url'] = df_url_grp['raw_text_url']

df_synopsis_grp

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url
0,2008-08-08,This final rule updates the payment rates used...,18,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...
1,2008-08-11,nanGeneral Services Acquisition Regulation; GS...,21,"NoTopic,NoTopic,Income taxes; Reporting and re...",General Services Administration;Health and Hum...,https://www.federalregister.gov/documents/full...
2,2008-08-12,"We, the U.S. Fish and Wildlife Service (Servic...",25,"NoTopic,NoTopic,NoTopic,Administrative practic...",Interior Department; Fish and Wildlife Service...,https://www.federalregister.gov/documents/full...
3,2008-08-13,The General Services Administration (GSA) is p...,28,"NoTopic,Grant programs-health; Health faciliti...",General Services Administration;Health and Hum...,https://www.federalregister.gov/documents/full...
4,2008-08-14,"We, the U.S. Fish and Wildlife Service (we or ...",19,"NoTopic,NoTopic,NoTopic,Freedom of information...",Interior Department; Fish and Wildlife Service...,https://www.federalregister.gov/documents/full...
...,...,...,...,...,...,...
1969,2016-06-27,The Coast Guard announces a public meeting to ...,31,"NoTopic,Brokers; Reporting and recordkeeping r...",Homeland Security Department; Coast Guard;Secu...,https://www.federalregister.gov/documents/full...
1970,2016-06-28,The Coast Guard proposes to establish a tempor...,40,Harbors; Marine safety; Navigation (water); Re...,Homeland Security Department; Coast Guard;Inte...,https://www.federalregister.gov/documents/full...
1971,2016-06-29,"In January 2011, NMFS implemented the trawl ra...",25,"Fisheries; Fishing,Airspace; Navigation (air),...",Commerce Department; National Oceanic and Atmo...,https://www.federalregister.gov/documents/full...
1972,2016-06-30,The Environmental Protection Agency (EPA) is p...,54,Air pollution control; Carbon monoxide; Enviro...,Environmental Protection Agency;Environmental ...,https://www.federalregister.gov/documents/full...


In [None]:
# Check the combined per-date frame: every column should be fully populated (no nulls).
df_synopsis_grp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1974 entries, 0 to 1973
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         1974 non-null   datetime64[ns]
 1   synopsis     1974 non-null   object        
 2   group_count  1974 non-null   int64         
 3   topic        1974 non-null   object        
 4   agencies     1974 non-null   object        
 5   text_url     1974 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 92.7+ KB


In [None]:
# Work on an independent copy for the text clean-up. A plain assignment would only create a
# second name for the same object, so every later edit of df_text would also silently
# change df_synopsis_grp.
df_text = df_synopsis_grp.copy()

In [None]:
# Remove a leading literal "nan" (the string form of a missing abstract), if present.
# An anchored pattern is used instead of lstrip('nan/'): lstrip treats its argument as a set
# of characters, so it would also eat the first letters of genuine text that happens to
# start with "n", "a" or "/".
# NOTE(review): only the start of each per-date string is cleaned; a "nan" belonging to a
# later document inside the joined text is left untouched.
df_text['synopsis'] = df_text['synopsis'].str.replace(r'^nan', '', regex=True)
df_text.head(1)

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url
0,2008-08-08,This final rule updates the payment rates used...,18,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...


In [None]:
# Remove a leading literal "nan", if present.
# An anchored pattern is used instead of lstrip('nan/'): lstrip treats its argument as a set
# of characters, so it would also eat the first letters of a genuine agency name that
# happens to start with "n", "a" or "/".
df_text['agencies'] = df_text['agencies'].str.replace(r'^nan', '', regex=True)
df_text.head(1)

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url
0,2008-08-08,This final rule updates the payment rates used...,18,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...


In [None]:
# Remove a leading literal "nan", if present.
# An anchored pattern is used instead of lstrip('nan/'): lstrip treats its argument as a set
# of characters, so it would also eat the first letters of a genuine topic that happens to
# start with "n", "a" or "/".
df_text['topic'] = df_text['topic'].str.replace(r'^nan', '', regex=True)
df_text.head(1)

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url
0,2008-08-08,This final rule updates the payment rates used...,18,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...


In [None]:
def _join_real_topics(joined):
    """Drop the "NoTopic" placeholders from one per-date topic string.

    ``joined`` holds the topic strings of all documents of a date, separated by commas.
    The pieces are re-joined with "; " so the last topic of one document stays separate
    from the first topic of the next; simply deleting the commas would fuse the two into
    one word. Empty pieces and "NoTopic" placeholders are skipped. Applying the function
    to its own output returns the same string.
    """
    pieces = (piece.strip() for piece in joined.split(','))
    return '; '.join(piece for piece in pieces if piece and piece != 'NoTopic')


df_text['topic'] = df_text['topic'].map(_join_real_topics)
df_text.head()

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url
0,2008-08-08,This final rule updates the payment rates used...,18,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...
1,2008-08-11,General Services Acquisition Regulation; GSAR ...,21,Income taxes; Reporting and recordkeeping requ...,General Services Administration;Health and Hum...,https://www.federalregister.gov/documents/full...
2,2008-08-12,"We, the U.S. Fish and Wildlife Service (Servic...",25,Administrative practice and procedure; Flood i...,Interior Department; Fish and Wildlife Service...,https://www.federalregister.gov/documents/full...
3,2008-08-13,The General Services Administration (GSA) is p...,28,Grant programs-health; Health facilities; Heal...,General Services Administration;Health and Hum...,https://www.federalregister.gov/documents/full...
4,2008-08-14,"We, the U.S. Fish and Wildlife Service (we or ...",19,Freedom of information; Reporting and recordke...,Interior Department; Fish and Wildlife Service...,https://www.federalregister.gov/documents/full...


In [None]:
# Show the complete, untruncated agency string for one date to inspect the placeholders.
# None means "no column-width limit"; the older spelling -1 is deprecated since pandas 1.0
# and rejected by newer releases.
with pd.option_context('display.max_colwidth', None):
    try1 = df_text.loc[df_text['date'] == "2009-05-11", 'agencies']
    display(try1)

186    NoAgencies;NoAgencies;NoAgencies;NoAgencies;NoAgencies;NoAgencies;NoAgencies;NoAgencies;Transportation Department; Federal Aviation Administration;Personnel Management Office;Interior Department; Surface Mining Reclamation and Enforcement Office;Health and Human Services Department; Food and Drug Administration;NoAgencies;Health and Human Services Department; Food and Drug Administration;NoAgencies
Name: agencies, dtype: object

In [None]:
# Remove the "NoAgencies" placeholders and any remaining commas from the per-date agency
# strings. Deleting a placeholder leaves its separator behind, so runs of ";" are collapsed
# to a single ";" and leading/trailing ";" are trimmed; otherwise the cell would keep empty
# entries such as ";;;;".
df_text['agencies'] = (
    df_text['agencies']
    .str.replace('NoAgencies', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace(r';{2,}', ';', regex=True)
    .str.strip(';')
)
df_text.head()

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url
0,2008-08-08,This final rule updates the payment rates used...,18,Health facilities; Kidney diseases; Medicare; ...,Health and Human Services Department; Centers ...,https://www.federalregister.gov/documents/full...
1,2008-08-11,General Services Acquisition Regulation; GSAR ...,21,Income taxes; Reporting and recordkeeping requ...,General Services Administration;Health and Hum...,https://www.federalregister.gov/documents/full...
2,2008-08-12,"We, the U.S. Fish and Wildlife Service (Servic...",25,Administrative practice and procedure; Flood i...,Interior Department; Fish and Wildlife Service...,https://www.federalregister.gov/documents/full...
3,2008-08-13,The General Services Administration (GSA) is p...,28,Grant programs-health; Health facilities; Heal...,General Services Administration;Health and Hum...,https://www.federalregister.gov/documents/full...
4,2008-08-14,"We, the U.S. Fish and Wildlife Service (we or ...",19,Freedom of information; Reporting and recordke...,Interior Department; Fish and Wildlife Service...,https://www.federalregister.gov/documents/full...


In [None]:
# Sanity check: this should display an empty frame.
# str.contains searches the whole string; str.match only tests the beginning of each string
# and would miss a placeholder that follows another agency name.
df_text[df_text['agencies'].str.contains('NoAgencies', regex=False)]

Unnamed: 0,date,synopsis,group_count,topic,agencies,text_url


In [None]:
# Final overview of the cleaned frame: columns, non-null counts and dtypes
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1974 entries, 0 to 1973
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         1974 non-null   datetime64[ns]
 1   synopsis     1974 non-null   object        
 2   group_count  1974 non-null   int64         
 3   topic        1974 non-null   object        
 4   agencies     1974 non-null   object        
 5   text_url     1974 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 92.7+ KB


In [None]:
# Write the cleaned per-date frame to disk; index=False keeps the row index out of the file
clean_docs_path = 'docs_clean.csv'
df_text.to_csv(clean_docs_path, index=False)