In [23]:
import pandas as pd
import os, requests, zipfile, warnings, time, random
from io import BytesIO
import datetime
import tqdm
import concurrent.futures
from threading import Lock
warnings.filterwarnings('ignore')

In [2]:
test_url = 'http://data.gdeltproject.org/gdeltv2/20250219000000.gkg.csv.zip'
base_url = 'http://data.gdeltproject.org/gdeltv2/'

In [3]:
def parse_data(data):
    df = pd.read_csv(
        data,
        delimiter='\t',
        encoding='latin',
        low_memory=False,
        header=None,
        usecols=[0, 1, 2, 3, 4, 7, 9, 13, 15, 17, 25, 26],
        names=[
            'GKGRECORDID',
            'V2.1DATE',
            'V2SOURCECOLLECTIONIDENTIFIER',
            'V2SOURCECOMMONNAME',
            'V2DOCUMENTIDENTIFIER',
            'V1THEMES',
            'V1LOCATIONS',
            'V1ORGANIZATIONS',
            'V1.5TONE',
            'V2GCAM',
            'V2.1TRANSLATIONINFO',
            'V2EXTRASXML'
        ]
    )
    return(df)

In [4]:
def download_unzip(url):
    """
    Downloads a zipped CSV file from the given URL, extracts the first file in the archive,
    and parses it into a pandas DataFrame using the parse_data function.

    Args:
        url (str): The URL to the .zip file containing the CSV.

    Returns:
        tuple: (DataFrame, status_code)
            DataFrame: The parsed data as a pandas DataFrame.
            status_code: The HTTP status code from the download request.
    """
    response = requests.get(url)
    status_code = response.status_code
    
    # Check if request was successful before proceeding
    if status_code != 200:
        return None, status_code
    
    else:
        # Create a BytesIO object from the downloaded content
        zip_bytes = BytesIO(response.content)

        # Open the zip file in memory
        with zipfile.ZipFile(zip_bytes, 'r') as zip_file:
            # Get the first file in the zip
            first_file = zip_file.namelist()[0]
            
            # Read the file content directly into memory
            with zip_file.open(first_file) as file:
                return(parse_data(file), status_code)

In [96]:
df,code=download_unzip(test_url)
print(code)

200


In [95]:
# testing a bad link
df, code = download_unzip('http://data.gdeltproject.org/gdeltv2/2029999.gkg.csv.zip')
print(code)

404


In [5]:
def generate_url_list(start_dt, end_dt, increment_minutes=15):
    url_list = []
    current_dt = start_dt
    
    while current_dt <= end_dt:
        # Format as YYYYMMDDHHMMSS
        datetime_str = current_dt.strftime('%Y%m%d%H%M%S')
        url_list.append(f'{base_url}{datetime_str}.gkg.csv.zip')
        
        # Add increment
        current_dt += datetime.timedelta(minutes=increment_minutes)
    
    return url_list

In [47]:
start = datetime.datetime(2018, 1, 1, 0, 0, 0)
end = datetime.datetime(2025, 5, 1, 0, 0, 0)
urls = generate_url_list(start, end, 15)
print(urls[0:10])

['http://data.gdeltproject.org/gdeltv2/20180101000000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101001500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101003000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101004500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101010000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101011500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101013000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101014500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101020000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101021500.gkg.csv.zip']


In [6]:
# function to chunk datetime_strings into groups of 100
def chunk_datetime_strings(urls, chunk_size=100):
    return [urls[i:i + chunk_size] for i in range(0, len(urls), chunk_size)]

In [22]:
def download_chunk_parallel(chunk, max_workers=10):
    """
    Downloads and processes multiple GDELT CSV files in parallel using threading.
    
    This function downloads zipped CSV files from URLs, extracts and parses them into
    pandas DataFrames, and combines them into a single DataFrame. It includes error
    handling, logging of failed downloads, and automatic stopping if too many
    consecutive downloads fail.
    
    Args:
        chunk (list): List of URLs pointing to .zip files containing CSV data.
        max_workers (int, optional): Maximum number of concurrent threads for downloading.
            Defaults to 10.
    
    Returns:
        pandas.DataFrame or None: Combined DataFrame containing data from all successful
            downloads, or None if no downloads succeeded.
    
    Features:
        - Parallel downloading using ThreadPoolExecutor
        - Thread-safe logging of failed downloads to timestamped log file
        - Progress bar showing download status
        - Automatic stopping if last 10 consecutive downloads fail
        - Error handling for network issues and parsing problems
    
    Error Handling:
        - HTTP errors (non-200 status codes) are logged
        - Network exceptions are caught and logged
        - Failed downloads are written to 'failed_downloads_YYYYMMDD_HHMMSS.txt'
        - Function stops early if 10 consecutive downloads fail to prevent wasted resources
    
    Note:
        Uses the existing download_unzip() and parse_data() functions for individual file processing.
    """
    df_list = []
    response_codes = []
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"../logs/failed_downloads_{current_time}.txt"
    lock = Lock()  # For thread-safe file writing
    
    def download_single_url(url):
        try:
            df, code = download_unzip(url)
            
            # Thread-safe logging of response codes
            with lock:
                response_codes.append(code)
                
                if code != 200:
                    with open(log_file, "a") as f:
                        f.write(f"{url} - Status Code: {code}\n")
                else:
                    return df
                    
                # Check last 10 response codes
                if len(response_codes) >= 10 and all(c != 200 for c in response_codes[-10:]):
                    raise Exception("Error: Last 10 downloads failed (status code != 200). Stopping.")

        except Exception as e:
            with lock:
                with open(log_file, "a") as f:
                    f.write(f"{url} - Error: {str(e)}\n")
            print(f"Failed to download or parse {url}: {str(e)}")
            return None

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all download tasks
        future_to_url = {executor.submit(download_single_url, url): url for url in chunk}
        
        # Collect results with progress bar
        for future in tqdm.tqdm(concurrent.futures.as_completed(future_to_url), 
                               total=len(chunk), desc="Downloading", unit="file"):
            try:
                result = future.result()
                if result is not None:
                    df_list.append(result)
            except Exception as e:
                print(f"Error in thread execution: {str(e)}")
                break

            time.sleep(0.1)
    
    if df_list:
        return pd.concat(df_list, ignore_index=True)
    else:
        print(f"None of the URLs worked.")
        return None

In [100]:
test=urls[0:10]
df=download_chunk_parallel(test, max_workers=2)
df

Downloading: 100%|██████████| 10/10 [00:03<00:00,  2.82file/s]


Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
0,20180101000000-0,20180101000000,1,stamfordadvocate.com,http://www.stamfordadvocate.com/news/politics/...,TAX_FNCACT;TAX_FNCACT_CHIEF;WB_696_PUBLIC_SECT...,"2#Florida, United States#US#USFL#27.8333#-81.7...",,"-3.64583333333333,3.38541666666667,7.03125,10....","wc:350,c12.1:42,c12.10:33,c12.12:8,c12.13:13,c...",,<PAGE_PRECISEPUBTIMESTAMP>20171231231600</PAGE...
1,20180101000000-1,20180101000000,1,thestar.com.my,https://www.thestar.com.my/business/business-n...,,1#Malaysia#MY#MY#2.5#112.5#MY;1#Japan#JA#JA#36...,public investment bank;zhejiang geely holding ...,"1.51098901098901,2.47252747252747,0.9615384615...","wc:671,c1.2:9,c1.3:1,c12.1:18,c12.10:59,c12.11...",,<PAGE_LINKS>https://www.thestar.com.my/Search<...
2,20180101000000-2,20180101000000,1,radaronline.com,https://radaronline.com/celebrity-news/bow-wow...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;TAX_WORLD...,1#United States#US#US#39.828175#-98.5795#US,,"4.66101694915254,5.50847457627119,0.8474576271...","wc:206,c12.1:31,c12.10:21,c12.12:6,c12.13:8,c1...",,<PAGE_LINKS>http://radaronline.com/tag/bow-wow...
3,20180101000000-3,20180101000000,1,sanantoniopost.com,http://www.sanantoniopost.com/news/255927392/g...,,"3#Houston, Texas, United States#US#USTX#29.763...",,"-0.704225352112676,1.93661971830986,2.64084507...","wc:527,c12.1:24,c12.10:43,c12.12:19,c12.13:7,c...",,<PAGE_LINKS>http://www.sanantoniopost.com/revi...
4,20180101000000-4,20180101000000,1,complex.com,http://www.complex.com/life/2017/12/dinosaur-e...,,"1#China#CH#CH#35#105#CH;4#Ganzhou, Jiangxi, Ch...",,"-0.495049504950495,1.48514851485149,1.98019801...","wc:187,c1.3:1,c12.1:9,c12.10:10,c12.12:4,c12.1...",,<PAGE_LINKS>http://www.cnn.com/2015/04/21/asia...
...,...,...,...,...,...,...,...,...,...,...,...,...
13853,20180101021500-1303,20180101021500,1,nintendo-insider.com,https://www.nintendo-insider.com/farming-simul...,,,giants software,"1.40646976090014,4.36005625879044,2.9535864978...","wc:667,c1.2:1,c1.3:1,c12.1:59,c12.10:70,c12.11...",,<PAGE_LINKS>https://www.nintendo-insider.com/f...
13854,20180101021500-1304,20180101021500,1,onlinenigeria.com,https://news2.onlinenigeria.com/news/general/6...,TAX_FNCACT;TAX_FNCACT_HOUSEMAID;TAX_FNCACT_WOM...,"1#Nigeria#NI#NI#10#8#NI;4#Joro, Borno, Nigeria...",,"-1.33779264214047,2.67558528428094,4.013377926...","wc:268,c12.1:16,c12.10:22,c12.12:13,c12.13:6,c...",,<PAGE_LINKS>https://news2.onlinenigeria.com/;h...
13855,20180101021500-1305,20180101021500,1,indiatimes.com,https://economictimes.indiatimes.com/news/inte...,RAPE;USPEC_POLICY1;TAX_FNCACT;TAX_FNCACT_CHIEF...,"2#Florida, United States#US#USFL#27.8333#-81.7...",,"-3.8265306122449,3.31632653061224,7.1428571428...","wc:356,c12.1:43,c12.10:33,c12.12:8,c12.13:13,c...",,<PAGE_LINKS>https://m.economictimes.com/topic/...
13856,20180101021500-1306,20180101021500,1,waateanews.com,https://www.waateanews.com/waateanews/x_news/M...,EDUCATION;ELECTION;,"4#Mangere, New Zealand (General), New Zealand#...",family support services mangere labour;televis...,"1.66666666666667,4.83333333333333,3.1666666666...","wc:551,c1.3:1,c12.1:34,c12.10:37,c12.12:12,c12...",,<PAGE_LINKS>http://www.waateanews.com;http://w...


In [101]:
df.columns

Index(['GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
       'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1THEMES', 'V1LOCATIONS',
       'V1ORGANIZATIONS', 'V1.5TONE', 'V2GCAM', 'V2.1TRANSLATIONINFO',
       'V2EXTRASXML'],
      dtype='object')

In [8]:
def first_pass_clean(df):
    # Limit to include United States
    df = df[df['V1LOCATIONS'].str.contains('united states', case=False, na=False)]

    # Limit to include airlines
    df = df[df['V1ORGANIZATIONS'].str.contains(
        "airplane|airline|airport|Alaska Airlines|American Airlines|Delta Air Lines|Frontier Airlines|Hawaiian Airlines|JetBlue|Southwest Airlines|Spirit Airlines|Sun Country Airlines|United Airlines|Allegiant Air"
        , case=False, na=False, regex=True)]
    
    # Drop if missing fields
    df = df.dropna(
        subset=['GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
       'V2DOCUMENTIDENTIFIER', 'V1LOCATIONS',
       'V1ORGANIZATIONS', 'V1.5TONE', 'V2GCAM']
        )
    
    # reset the index
    df = df.reset_index(drop=True)

    return(df)

In [24]:
# try downloading, cleaning, and saving the first chunk
start = datetime.datetime(2018, 1, 1, 0, 0, 0)
end = datetime.datetime(2025, 5, 1, 0, 0, 0)
urls = generate_url_list(start, end, 15)

# Randomly shuffle the urls
random.seed(1234)  # For reproducibility
random.shuffle(urls)

chunked_datetime_strings = chunk_datetime_strings(urls, 10)
chunks = chunked_datetime_strings[0:2]

for chunk in chunks:
    df = download_chunk_parallel(chunk, max_workers=10)
    df = first_pass_clean(df)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"../data/processed/gdelt_cleaned_{timestamp}.csv"
    df.to_csv(output_file, index=False)


Downloading: 100%|██████████| 10/10 [00:03<00:00,  3.26file/s]
Downloading: 100%|██████████| 10/10 [00:03<00:00,  2.75file/s]


In [25]:
df

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
0,20220205064500-218,20220205064500,1,pressrepublican.com,https://www.pressrepublican.com/news/cape-air-...,EPU_ECONOMY_HISTORIC;SOC_POINTSOFINTEREST;SOC_...,"3#Boston, Massachusetts, United States#US#USMA...",logan international airport;twitter;plattsburg...,"-1.17994100294985,0.442477876106195,1.62241887...","wc:608,c1.3:1,c12.1:27,c12.10:50,c12.11:3,c12....",,<PAGE_LINKS>http://regulations.gov</PAGE_LINKS...
1,20220205064500-315,20220205064500,1,thedailynewsonline.com,https://www.thedailynewsonline.com/news/olympi...,,"2#New York, United States#US#USNY#42.1497#-74....",google;u s congress;dreamworks;world trade org...,"-2.67379679144385,2.0855614973262,4.7593582887...","wc:1714,c1.1:1,c1.2:2,c1.3:1,c12.1:126,c12.10:...",,<PAGE_LINKS>http://NBCOlympics.com;http://www....
2,20220205064500-369,20220205064500,1,fox61.com,https://www.fox61.com/article/news/nation-worl...,CRISISLEX_CRISISLEXREC;SOC_POINTSOFINTEREST;SO...,1#Afghanistan#AF#AF#33#66#AF;1#United Kingdom#...,hamid karzai international airport on aug;depa...,"-7.70750988142292,0.395256916996047,8.10276679...","wc:908,c1.1:1,c1.3:2,c12.1:47,c12.10:98,c12.12...",,<PAGE_LINKS>https://www.dvidshub.net/video/830...
3,20240531074500-1,20240531074500,1,wfaa.com,https://www.wfaa.com/article/news/local/thursd...,DELAY;USPEC_UNCERTAINTY1;SOC_POINTSOFINTEREST;...,1#Mexico#MX#MX#23#-102#MX;1#United States#US#U...,american airlines,"-7.14285714285714,0.274725274725275,7.41758241...","wc:328,c12.1:21,c12.10:19,c12.12:8,c12.13:8,c1...",,<PAGE_LINKS>https://www.flightaware.com/</PAGE...
4,20240531074500-130,20240531074500,1,livemint.com,https://www.livemint.com/news/india/passengers...,MEDIA_MSM;TAX_FNCACT;TAX_FNCACT_JOURNALIST;WB_...,"4#Delhi, Delhi, India#IN#IN07#28.6667#77.2167#...",air india;delhi indira gandhi international ai...,"-2.22222222222222,2.77777777777778,5,7.7777777...","wc:337,c12.1:27,c12.10:23,c12.12:9,c12.13:8,c1...",,<PAGE_LINKS>https://www.livemint.com/companies...
...,...,...,...,...,...,...,...,...,...,...,...,...
97,20200430233000-2145,20200430233000,1,wfaa.com,https://www.wfaa.com/article/news/health/coron...,TAX_DISEASE;TAX_DISEASE_CORONAVIRUS;GENERAL_HE...,"2#New York, United States#US#USNY#42.1497#-74....",johns hopkins university;greta thunberg founda...,"-3.67114788004136,2.17166494312306,5.842812823...","wc:1738,c1.1:1,c1.2:8,c1.3:3,c12.1:73,c12.10:1...",,<PAGE_LINKS>https://coronavirus.jhu.edu/map.ht...
98,20200430233000-2148,20200430233000,1,milfordmirror.com,https://www.milfordmirror.com/business/article...,TAX_FNCACT;TAX_FNCACT_WOMAN;ARMEDCONFLICT;TAX_...,"3#Fremont, California, United States#US#USCA#3...",paper association;twitter;associated press;mic...,"-2.26377952755906,2.36220472440945,4.625984251...","wc:872,c1.2:14,c12.1:39,c12.10:83,c12.11:1,c12...",,<PAGE_PRECISEPUBTIMESTAMP>20200430184700</PAGE...
99,20200430233000-2228,20200430233000,1,wtsp.com,https://www.wtsp.com/article/travel/what-airli...,ARMEDCONFLICT;TAX_FNCACT;TAX_FNCACT_EMPLOYEES;...,1#United States#US#US#39.828175#-98.5795#US;4#...,google;united airlines;american airlines;front...,"0.428265524625268,1.49892933618844,1.070663811...","wc:415,c12.1:19,c12.10:27,c12.12:12,c12.13:5,c...",,<PAGE_LINKS>https://www.wtsp.com/article/news/...
100,20200430233000-2299,20200430233000,1,wxxinews.org,https://www.wxxinews.org/post/american-airline...,TAX_DISEASE;TAX_DISEASE_CORONAVIRUS;GENERAL_HE...,1#United States#US#US#39.828175#-98.5795#US,airlines for america;united airlines;american ...,"-3.09523809523809,1.66666666666667,4.761904761...","wc:375,c1.2:3,c12.1:27,c12.10:38,c12.11:1,c12....",,<PAGE_AUTHORS>David Schaper</PAGE_AUTHORS><PAG...


In [35]:
# More extensive cleaning
df2=df.copy()

# dates
df2['datetime'] = pd.to_datetime(df2['V2.1DATE'], format='%Y%m%d%H%M%S', errors='coerce')
df2['date'] = df2['datetime'].dt.date

# company Dummies
to_check= ["airplane","airline","airport","Alaska Airlines","American Airlines","Delta Air Lines","Frontier Airlines","Hawaiian Airlines","JetBlue","Southwest Airlines","Spirit Airlines","Sun Country Airlines","United Airlines","Allegiant Air"]
for word in to_check:
    df2[word] = df2['V1ORGANIZATIONS'].str.contains(word, case=False, na=False).astype(int)

# Extract the article title from the V2EXTRASXML column, which is between <PAGE_TITLE> and </PAGE_TITLE> 
df2['article_title'] = df2['V2EXTRASXML'].str.extract(r'<PAGE_TITLE>(.*?)</PAGE_TITLE>', expand=False)

# Split V1.5TONE into multiple columns using , as the delimiter
df2[['Tone','Positive Score','Negative Score',
     'Polarity','Activity Reference Density',
     'Self/Group Reference Density','Word Count']] = df2['V1.5TONE'].str.split(',', expand=True)

# Keep the first time that a V2DOCUMENTIDENTIFIER value appears
df2 = df2.sort_values(by=['V2DOCUMENTIDENTIFIER', 'datetime']).drop_duplicates(subset='V2DOCUMENTIDENTIFIER', keep='first')

df2.drop(columns=['V2.1DATE','V1LOCATIONS','V2EXTRASXML'], inplace=True)

df2

Unnamed: 0,GKGRECORDID,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,datetime,...,United Airlines,Allegiant Air,article_title,Tone,Positive Score,Negative Score,Polarity,Activity Reference Density,Self/Group Reference Density,Word Count
65,20180608124500-2560,1,jamaicaplaingazette.com,http://jamaicaplaingazette.com/2018/06/08/avia...,TAX_ETHNICITY;TAX_ETHNICITY_ENGLISH;TAX_WORLDL...,english hs army;boston logan international air...,"1.8348623853211,1.8348623853211,0,1.8348623853...","wc:104,c12.1:3,c12.10:8,c12.12:1,c12.13:3,c12....",,2018-06-08 12:45:00,...,0,0,,1.8348623853211,1.8348623853211,0,1.8348623853211,17.4311926605505,0,104
23,20190429011500-39,1,clarepeople.com,http://www.clarepeople.com/2014/01/16/shannon-...,SOC_POINTSOFINTEREST;SOC_POINTSOFINTEREST_AIRP...,dublin airport authority,"2.02429149797571,2.83400809716599,0.8097165991...","wc:222,c12.1:11,c12.10:21,c12.12:6,c12.13:9,c1...",,2019-04-29 01:15:00,...,0,0,,2.02429149797571,2.83400809716599,0.809716599190283,3.64372469635628,15.3846153846154,1.61943319838057,222
92,20200430233000-1697,1,firstlightradio.com,http://www.firstlightradio.com/news/americans-...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICANS;EPU_ECON...,airlines for america;boeing;association of fli...,"-0.795454545454545,2.38636363636364,3.18181818...","wc:811,c1.2:7,c12.1:69,c12.10:83,c12.11:1,c12....",,2020-04-30 23:30:00,...,0,0,Americans look to the road rather than the ski...,-0.795454545454545,2.38636363636364,3.18181818181818,5.56818181818182,27.3863636363636,0.909090909090909,811
63,20180608124500-1921,1,latimes.com,http://www.latimes.com/business/realestate/hot...,TAX_ETHNICITY;TAX_ETHNICITY_EUROPEANS;ECON_HOU...,janss co;corelogic;janss investment co;santa m...,"0.124069478908189,2.10918114143921,1.985111662...","wc:784,c1.1:1,c1.2:1,c12.1:37,c12.10:69,c12.11...",,2018-06-08 12:45:00,...,0,0,,0.124069478908189,2.10918114143921,1.98511166253102,4.09429280397022,20.8436724565757,0,784
61,20180608124500-1741,1,latimes.com,http://www.latimes.com/travel/la-tr-9-things-t...,WB_1979_NATURAL_RESOURCE_MANAGEMENT;WB_435_AGR...,shore hotel;colorado tourism office;natural re...,"1.42348754448399,3.02491103202847,1.6014234875...","wc:506,c1.1:1,c1.3:4,c12.1:33,c12.10:41,c12.12...",,2018-06-08 12:45:00,...,0,0,,1.42348754448399,3.02491103202847,1.60142348754448,4.62633451957295,24.3772241992883,2.31316725978648,506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,20200430233000-1160,1,wtsp.com,https://www.wtsp.com/article/news/health/coron...,TAX_DISEASE;TAX_DISEASE_CORONAVIRUS;GENERAL_HE...,centers for disease;united airlines;southwest ...,"0.722021660649819,2.16606498194946,1.444043321...","wc:247,c12.1:14,c12.10:20,c12.12:6,c12.13:8,c1...",,2020-04-30 23:30:00,...,1,0,"Coronavirus: Delta, American, Frontier require...",0.722021660649819,2.16606498194946,1.44404332129964,3.6101083032491,26.3537906137184,2.52707581227437,247
70,20200430233000-474,1,wtsp.com,https://www.wtsp.com/article/news/health/coron...,TAX_FNCACT;TAX_FNCACT_EDITOR;TAX_FNCACT_EMPLOY...,johns hopkins university;transportation securi...,"-2.17391304347826,1.86335403726708,4.037267080...","wc:302,c12.1:14,c12.10:40,c12.12:19,c12.13:10,...",,2020-04-30 23:30:00,...,0,0,"5 TSA employees have died from coronavirus, ov...",-2.17391304347826,1.86335403726708,4.03726708074534,5.90062111801242,20.1863354037267,0,302
99,20200430233000-2228,1,wtsp.com,https://www.wtsp.com/article/travel/what-airli...,ARMEDCONFLICT;TAX_FNCACT;TAX_FNCACT_EMPLOYEES;...,google;united airlines;american airlines;front...,"0.428265524625268,1.49892933618844,1.070663811...","wc:415,c12.1:19,c12.10:27,c12.12:12,c12.13:5,c...",,2020-04-30 23:30:00,...,1,0,What airlines are requiring customers wear a f...,0.428265524625268,1.49892933618844,1.07066381156317,2.56959314775161,30.6209850107066,0,415
100,20200430233000-2299,1,wxxinews.org,https://www.wxxinews.org/post/american-airline...,TAX_DISEASE;TAX_DISEASE_CORONAVIRUS;GENERAL_HE...,airlines for america;united airlines;american ...,"-3.09523809523809,1.66666666666667,4.761904761...","wc:375,c1.2:3,c12.1:27,c12.10:38,c12.11:1,c12....",,2020-04-30 23:30:00,...,1,0,American Airlines And United Report Nearly $4 ...,-3.09523809523809,1.66666666666667,4.76190476190476,6.42857142857143,24.047619047619,0.714285714285714,375


In [26]:
df['V2SOURCECOMMONNAME'].value_counts().head(50)

V2SOURCECOMMONNAME
iheart.com                    6
wfaa.com                      4
wickedlocal.com               4
wtsp.com                      3
king5.com                     3
latimes.com                   2
aero-news.net                 2
localmemphis.com              2
wmra.org                      2
yahoo.com                     2
oann.com                      1
pressrepublican.com           1
thedailynewsonline.com        1
fox61.com                     1
livemint.com                  1
news.com.au                   1
foxsanantonio.com             1
arkansasonline.com            1
thefinancialexpress.com.bd    1
kqky.com                      1
prnewswire.com                1
whbl.com                      1
nairametrics.com              1
firerescue1.com               1
wionews.com                   1
express.co.uk                 1
cbslocal.com                  1
nasa.gov                      1
timescolonist.com             1
patch.com                     1
clarepeople.com      

In [27]:
# Check if there are any duplicates in field V2DOCUMENTIDENTIFIER
duplicates = df[df.duplicated(subset=['V2DOCUMENTIDENTIFIER'], keep=False)]
duplicates

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML


In [29]:
df['V2EXTRASXML'][0]

'<PAGE_LINKS>http://regulations.gov</PAGE_LINKS><PAGE_AUTHORS>CARA CHAPMAN Press-Republican</PAGE_AUTHORS><PAGE_PRECISEPUBTIMESTAMP>20220205054500</PAGE_PRECISEPUBTIMESTAMP><PAGE_TITLE>Cape Air sole bid for funding at Plattsburgh International</PAGE_TITLE>'

In [13]:
df2=df.copy()

df2['datetime'] = pd.to_datetime(df2['V2.1DATE'], format='%Y%m%d%H%M%S', errors='coerce')
df2['date'] = df2['datetime'].dt.date

df2['date'].value_counts().sort_index()

date
2018-01-02    12
2018-01-03    43
2018-01-04    18
2018-01-05    45
2018-01-07    17
              ..
2025-04-19     7
2025-04-20     3
2025-04-22    10
2025-04-24     8
2025-04-30    18
Name: count, Length: 805, dtype: int64

In [17]:
# Extract the article title from the V2EXTRASXML column, which is between <PAGE_TITLE> and </PAGE_TITLE> 
df2['article_title'] = df2['V2EXTRASXML'].str.extract(r'<PAGE_TITLE>(.*?)</PAGE_TITLE>', expand=False)
df2['article_title'].value_counts().head(20)

article_title
                                                                                     27
Florida Woman Faked Medical Issue During Flight To Get Bigger Seat: Cops             22
Guitar signed by British music stars to go under the hammer                          15
Latest Articles                                                                      14
Biden 'completely' rules out quitting 2024 bid as he sits for TV interview           13
Today in History                                                                     13
News briefs                                                                          13
Heartwarming Pic Shows Medical Workers Flying To NYC To Help Fight COVID-19          13
Mother And Toddler Kicked Off Flight Because The Boy Refused To Wear A Mask          13
'Come From Away' Is Even More Potent In A Covid World &#x2014; Noise11.com | News    12
American Airlines plane is diverted after a passenger assaults a flight attendant    12
WATCH: Woman Bites

In [80]:
# Parse the semicolon-delimited V1ORGANIZATIONS column
df2['V1ORGANIZATIONS'] = df2['V1ORGANIZATIONS'].str.split(';')
# Explode the DataFrame to have one row per organization
df2 = df2.explode('V1ORGANIZATIONS')
# Remove leading and trailing whitespace from the organization names
df2['V1ORGANIZATIONS'] = df2['V1ORGANIZATIONS'].str.strip()
# Remove rows where V1ORGANIZATIONS is NaN or empty
df2 = df2[df2['V1ORGANIZATIONS'].notna() & (df2['V1ORGANIZATIONS'] != '')]



df2

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",google,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",big data solutions,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",devops summit power panel big data,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",marketing at cloud academy,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",eventbrite financial technology,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
...,...,...,...,...,...,...,...,...,...,...,...,...
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",cherokee place united methodist church,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",charleston international airport,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",sandy hook elementary school in newton,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",college of charleston on,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>


In [81]:
df2['V1ORGANIZATIONS'].value_counts().head(20)

V1ORGANIZATIONS
associated press                    154
united states                       145
china airlines                      144
pasadena police                     128
twitter                              95
american airlines                    79
facebook                             71
college football playoff             60
hawaiian airlines                    46
instagram                            44
arizona state university             43
frank lloyd wright foundation        43
sky harbor international airport     42
york guggenheim museum               42
first christian church               42
massachusetts port authority         41
cnn                                  40
arizona biltmore hotel               40
white house                          38
delta air lines                      38
Name: count, dtype: int64

In [73]:
df4=df2[df2['V1ORGANIZATIONS']=='american airlines'].copy()
df4

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
3612,20180101003000-1053,20180101003000,1,wqcmfm.com,http://www.wqcmfm.com/syndicated-article/?id=5...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
4200,20180101004500-127,20180101004500,1,khar590.com,http://www.khar590.com/syndicated-article/?id=...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
4552,20180101004500-479,20180101004500,1,1640thechamp.com,http://www.1640thechamp.com/syndicated-article...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
6442,20180101010000-806,20180101010000,1,thezone1059.com,http://www.thezone1059.com/syndicated-article/...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-3.96825396825397,0.396825396825397,4.36507936...","wc:255,c12.1:14,c12.10:15,c12.12:8,c12.13:4,c1...",,
9563,20180101013000-904,20180101013000,1,rockthefox.com,http://www.rockthefox.com/syndicated-article/?...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
...,...,...,...,...,...,...,...,...,...,...,...,...
134465,20180102003000-1263,20180102003000,1,am930theanswer.com,http://am930theanswer.com/news/national/airlin...,GENERAL_HEALTH;MEDICAL;SOC_POINTSOFINTEREST;SO...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines,"-1.40845070422535,0,1.40845070422535,1.4084507...","wc:130,c12.1:1,c12.10:3,c12.12:1,c12.13:1,c12....",,<PAGE_LINKS>http://am930theanswer.com/news/nat...
134768,20180102003000-1566,20180102003000,1,newsadvance.com,http://www.newsadvance.com/news/national/wire/...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;SOC_POINT...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines,"-1.48148148148148,0,1.48148148148148,1.4814814...","wc:130,c12.1:3,c12.10:5,c12.12:1,c12.13:2,c12....",,<PAGE_LINKS>http://www.wcvb.com/article/4-peop...
135711,20180102004500-888,20180102004500,1,americanlivewire.com,http://americanlivewire.com/2013-12-14-1-foot-...,,"3#Carrabassett, Maine, United States#US#USME#4...",american airlines,"-1.61507402422611,1.74966352624495,3.364737550...","wc:684,c1.2:1,c12.1:35,c12.10:66,c12.12:21,c12...",,<PAGE_LINKS>http://americanlivewire.com/author...
135761,20180102004500-938,20180102004500,1,kctv5.com,http://www.kctv5.com/story/37170886/mechanical...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;SOC_POINT...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines,"-1.45985401459854,0,1.45985401459854,1.4598540...","wc:133,c12.1:3,c12.10:5,c12.12:1,c12.13:2,c12....",,<PAGE_LINKS>http://www.wcvb.com/article/4-peop...


In [70]:
df3=pd.DataFrame(df2['V1ORGANIZATIONS'].value_counts().reset_index())
df3

Unnamed: 0,V1ORGANIZATIONS,count
0,united states,11857
1,associated press,8914
2,twitter,6368
3,facebook,4249
4,cnn,2984
...,...,...
44959,seaman high school,1
44960,university of alabama birmingham,1
44961,florence events center,1
44962,georgia southern university jiann,1
