In [1]:
import concurrent.futures
import csv
import datetime
import os
import random
import time
import warnings
import zipfile
from io import BytesIO
from threading import Event, Lock

import pandas as pd
import requests
import tqdm

warnings.filterwarnings('ignore')

In [2]:
# Root of the GDELT v2 file server; generate_url_list() prefixes it to every timestamp.
base_url = 'http://data.gdeltproject.org/gdeltv2/'
# One concrete GKG archive, used to try out download_unzip() on a single file.
test_url = base_url + '20250219000000.gkg.csv.zip'

In [3]:
def parse_data(data):
    """
    Parses one raw, header-less, tab-delimited GKG file into a pandas DataFrame.

    Only 12 of the file's columns are kept (selected by position) and they are
    given the names listed in the `names` argument below.

    Args:
        data: A path or file-like object that pandas.read_csv can read.

    Returns:
        pandas.DataFrame: One row per input line, with the 12 selected columns.
    """
    return pd.read_csv(
        data,
        delimiter='\t',
        # NOTE(review): the feed is presumably UTF-8; 'latin' never fails to decode
        # but may turn non-ASCII text into mojibake -- confirm before relying on text fields.
        encoding='latin',
        low_memory=False,
        header=None,
        # The input has no quoting convention. With the default quoting, a field
        # that merely starts with a double quote opens a "quoted" section that
        # swallows the following tabs and newlines, merging rows or aborting the parse.
        quoting=csv.QUOTE_NONE,
        # Zero-based positions of the columns to keep, in the same order as `names`.
        usecols=[0, 1, 2, 3, 4, 7, 9, 13, 15, 17, 25, 26],
        names=[
            'GKGRECORDID',
            'V2.1DATE',
            'V2SOURCECOLLECTIONIDENTIFIER',
            'V2SOURCECOMMONNAME',
            'V2DOCUMENTIDENTIFIER',
            'V1THEMES',
            'V1LOCATIONS',
            'V1ORGANIZATIONS',
            'V1.5TONE',
            'V2GCAM',
            'V2.1TRANSLATIONINFO',
            'V2EXTRASXML'
        ]
    )

In [4]:
def download_unzip(url, timeout=60):
    """
    Downloads a zipped CSV file from the given URL, extracts the first file in the archive,
    and parses it into a pandas DataFrame using the parse_data function.

    Args:
        url (str): The URL to the .zip file containing the CSV.
        timeout (float or tuple, optional): Passed to requests.get so a stalled
            connection raises instead of blocking the calling thread forever.
            Defaults to 60 seconds.

    Returns:
        tuple: (DataFrame, status_code)
            DataFrame: The parsed data as a pandas DataFrame, or None when the
                response status is not 200.
            status_code: The HTTP status code from the download request.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
        IndexError: If the archive contains no files.
    """
    response = requests.get(url, timeout=timeout)
    status_code = response.status_code

    # Nothing to parse unless the request succeeded.
    if status_code != 200:
        return None, status_code

    # Unzip entirely in memory; nothing is written to disk.
    with zipfile.ZipFile(BytesIO(response.content), 'r') as zip_file:
        # Only the first member of the archive is read.
        first_file = zip_file.namelist()[0]
        with zip_file.open(first_file) as file:
            return parse_data(file), status_code

In [96]:
# Smoke test: download and parse the single archive at test_url, then print the HTTP status code.
df,code=download_unzip(test_url)
print(code)

200


In [95]:
# Testing a bad link: the file name is not a 14-digit timestamp. For any non-200
# response download_unzip returns (None, status_code) rather than raising.
df, code = download_unzip('http://data.gdeltproject.org/gdeltv2/2029999.gkg.csv.zip')
print(code)

404


In [5]:
def generate_url_list(start_dt, end_dt, increment_minutes=15):
    """
    Builds the list of GKG archive URLs for every timestamp from start_dt to end_dt.

    Args:
        start_dt (datetime.datetime): First timestamp to include.
        end_dt (datetime.datetime): Last timestamp to include (inclusive when it
            falls exactly on a step).
        increment_minutes (int or float, optional): Step between consecutive
            timestamps, in minutes. Must be positive. Defaults to 15.

    Returns:
        list: URLs of the form '<base_url>YYYYMMDDHHMMSS.gkg.csv.zip', in
            chronological order. Empty when start_dt is after end_dt.

    Raises:
        ValueError: If increment_minutes is zero or negative, which would
            otherwise never advance past end_dt and loop forever.
    """
    if increment_minutes <= 0:
        raise ValueError(
            f"increment_minutes must be positive, got {increment_minutes!r}"
        )

    step = datetime.timedelta(minutes=increment_minutes)
    url_list = []
    current_dt = start_dt

    while current_dt <= end_dt:
        # Format as YYYYMMDDHHMMSS
        datetime_str = current_dt.strftime('%Y%m%d%H%M%S')
        url_list.append(f'{base_url}{datetime_str}.gkg.csv.zip')
        current_dt += step

    return url_list

In [47]:
# Build the URL of every 15-minute archive from 2018-01-01 00:00 through
# 2025-05-01 00:00 (both endpoints included) and preview the first ten.
start = datetime.datetime(2018, 1, 1, 0, 0, 0)
end = datetime.datetime(2025, 5, 1, 0, 0, 0)
urls = generate_url_list(start, end, 15)
print(urls[0:10])

['http://data.gdeltproject.org/gdeltv2/20180101000000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101001500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101003000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101004500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101010000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101011500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101013000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101014500.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101020000.gkg.csv.zip', 'http://data.gdeltproject.org/gdeltv2/20180101021500.gkg.csv.zip']


In [6]:
def chunk_datetime_strings(urls, chunk_size=100):
    """
    Splits a sequence into consecutive slices of at most chunk_size items.

    Args:
        urls: Any sliceable sequence (the notebook passes a list of URLs).
        chunk_size (int, optional): Maximum length of each slice. Defaults to 100.

    Returns:
        list: The slices in original order; the last one may be shorter.
    """
    chunks = []
    for offset in range(0, len(urls), chunk_size):
        chunks.append(urls[offset:offset + chunk_size])
    return chunks

In [7]:
def download_chunk_parallel(chunk, max_workers=10):
    """
    Downloads and parses a list of GDELT archives in parallel and concatenates the results.

    Each URL is handed to download_unzip() on a ThreadPoolExecutor worker thread.
    Every finished download is recorded in completion order. Once the 10 most
    recently recorded outcomes are all failures, a stop flag is set and every
    URL that has not started yet is skipped instead of being requested.
    Downloads already in flight are allowed to finish.

    Args:
        chunk (list): List of URLs pointing to .zip files containing CSV data.
        max_workers (int, optional): Maximum number of concurrent threads for downloading.
            Defaults to 10.

    Returns:
        pandas.DataFrame or None: Combined DataFrame containing data from all successful
            downloads (in completion order), or None if no downloads succeeded.

    Error Handling:
        - A non-200 status code counts as a failure and is logged.
        - Any exception raised by download_unzip() (network error, bad zip,
          parse error) counts as a failure, is logged, and is printed.
        - Failures and skipped URLs are appended to
          'failed_downloads_YYYYMMDD_HHMMSS.txt' in the current working directory.

    Note:
        Uses the existing download_unzip() and parse_data() functions for individual file processing.
    """
    df_list = []
    response_codes = []  # one entry per finished download; None when it raised
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f"failed_downloads_{current_time}.txt"
    lock = Lock()  # guards response_codes and appends to the log file
    stop_event = Event()  # set once the last 10 recorded outcomes are all failures

    def record_outcome(code, log_line):
        """Records one outcome, logs it if it failed, and trips the stop flag after 10 straight failures."""
        with lock:
            response_codes.append(code)
            if log_line is not None:
                with open(log_file, "a") as f:
                    f.write(log_line)
            if len(response_codes) >= 10 and all(c != 200 for c in response_codes[-10:]):
                stop_event.set()

    def download_single_url(url):
        """Downloads one URL; returns its DataFrame, or None on failure or skip."""
        # The flag has to be checked here: a task that is already queued cannot be
        # aborted by raising from another task, only by declining to start it.
        if stop_event.is_set():
            with lock:
                with open(log_file, "a") as f:
                    f.write(f"{url} - Skipped: stopped after 10 consecutive failures\n")
            return None

        try:
            df, code = download_unzip(url)
        except Exception as e:
            record_outcome(None, f"{url} - Error: {str(e)}\n")
            print(f"Failed to download or parse {url}: {str(e)}")
            return None

        if code != 200:
            record_outcome(code, f"{url} - Status Code: {code}\n")
            return None

        record_outcome(code, None)
        return df

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all download tasks
        future_to_url = {executor.submit(download_single_url, url): url for url in chunk}

        # Collect results with progress bar
        for future in tqdm.tqdm(concurrent.futures.as_completed(future_to_url),
                                total=len(chunk), desc="Downloading", unit="file"):
            try:
                result = future.result()
            except Exception as e:
                # download_single_url handles its own errors, so this is unexpected;
                # report it and keep collecting the remaining results.
                print(f"Error in thread execution for {future_to_url[future]}: {str(e)}")
                continue
            if result is not None:
                df_list.append(result)

    if stop_event.is_set():
        print(f"Stopped early: the last 10 downloads failed. See {log_file} for failed and skipped URLs.")

    if df_list:
        return pd.concat(df_list, ignore_index=True)
    else:
        print("None of the URLs worked.")
        return None

In [100]:
# Trial run: fetch the first ten URLs with two worker threads and display the combined frame.
test=urls[0:10]
df=download_chunk_parallel(test, max_workers=2)
df

Downloading: 100%|██████████| 10/10 [00:03<00:00,  2.82file/s]


Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
0,20180101000000-0,20180101000000,1,stamfordadvocate.com,http://www.stamfordadvocate.com/news/politics/...,TAX_FNCACT;TAX_FNCACT_CHIEF;WB_696_PUBLIC_SECT...,"2#Florida, United States#US#USFL#27.8333#-81.7...",,"-3.64583333333333,3.38541666666667,7.03125,10....","wc:350,c12.1:42,c12.10:33,c12.12:8,c12.13:13,c...",,<PAGE_PRECISEPUBTIMESTAMP>20171231231600</PAGE...
1,20180101000000-1,20180101000000,1,thestar.com.my,https://www.thestar.com.my/business/business-n...,,1#Malaysia#MY#MY#2.5#112.5#MY;1#Japan#JA#JA#36...,public investment bank;zhejiang geely holding ...,"1.51098901098901,2.47252747252747,0.9615384615...","wc:671,c1.2:9,c1.3:1,c12.1:18,c12.10:59,c12.11...",,<PAGE_LINKS>https://www.thestar.com.my/Search<...
2,20180101000000-2,20180101000000,1,radaronline.com,https://radaronline.com/celebrity-news/bow-wow...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;TAX_WORLD...,1#United States#US#US#39.828175#-98.5795#US,,"4.66101694915254,5.50847457627119,0.8474576271...","wc:206,c12.1:31,c12.10:21,c12.12:6,c12.13:8,c1...",,<PAGE_LINKS>http://radaronline.com/tag/bow-wow...
3,20180101000000-3,20180101000000,1,sanantoniopost.com,http://www.sanantoniopost.com/news/255927392/g...,,"3#Houston, Texas, United States#US#USTX#29.763...",,"-0.704225352112676,1.93661971830986,2.64084507...","wc:527,c12.1:24,c12.10:43,c12.12:19,c12.13:7,c...",,<PAGE_LINKS>http://www.sanantoniopost.com/revi...
4,20180101000000-4,20180101000000,1,complex.com,http://www.complex.com/life/2017/12/dinosaur-e...,,"1#China#CH#CH#35#105#CH;4#Ganzhou, Jiangxi, Ch...",,"-0.495049504950495,1.48514851485149,1.98019801...","wc:187,c1.3:1,c12.1:9,c12.10:10,c12.12:4,c12.1...",,<PAGE_LINKS>http://www.cnn.com/2015/04/21/asia...
...,...,...,...,...,...,...,...,...,...,...,...,...
13853,20180101021500-1303,20180101021500,1,nintendo-insider.com,https://www.nintendo-insider.com/farming-simul...,,,giants software,"1.40646976090014,4.36005625879044,2.9535864978...","wc:667,c1.2:1,c1.3:1,c12.1:59,c12.10:70,c12.11...",,<PAGE_LINKS>https://www.nintendo-insider.com/f...
13854,20180101021500-1304,20180101021500,1,onlinenigeria.com,https://news2.onlinenigeria.com/news/general/6...,TAX_FNCACT;TAX_FNCACT_HOUSEMAID;TAX_FNCACT_WOM...,"1#Nigeria#NI#NI#10#8#NI;4#Joro, Borno, Nigeria...",,"-1.33779264214047,2.67558528428094,4.013377926...","wc:268,c12.1:16,c12.10:22,c12.12:13,c12.13:6,c...",,<PAGE_LINKS>https://news2.onlinenigeria.com/;h...
13855,20180101021500-1305,20180101021500,1,indiatimes.com,https://economictimes.indiatimes.com/news/inte...,RAPE;USPEC_POLICY1;TAX_FNCACT;TAX_FNCACT_CHIEF...,"2#Florida, United States#US#USFL#27.8333#-81.7...",,"-3.8265306122449,3.31632653061224,7.1428571428...","wc:356,c12.1:43,c12.10:33,c12.12:8,c12.13:13,c...",,<PAGE_LINKS>https://m.economictimes.com/topic/...
13856,20180101021500-1306,20180101021500,1,waateanews.com,https://www.waateanews.com/waateanews/x_news/M...,EDUCATION;ELECTION;,"4#Mangere, New Zealand (General), New Zealand#...",family support services mangere labour;televis...,"1.66666666666667,4.83333333333333,3.1666666666...","wc:551,c1.3:1,c12.1:34,c12.10:37,c12.12:12,c12...",,<PAGE_LINKS>http://www.waateanews.com;http://w...


In [101]:
df.columns

Index(['GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
       'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1THEMES', 'V1LOCATIONS',
       'V1ORGANIZATIONS', 'V1.5TONE', 'V2GCAM', 'V2.1TRANSLATIONINFO',
       'V2EXTRASXML'],
      dtype='object')

In [8]:
def first_pass_clean(df):
    """
    Narrows a parsed GKG frame to rows about US locations and aviation organizations.

    Args:
        df (pandas.DataFrame): Frame with the columns produced by parse_data().

    Returns:
        pandas.DataFrame: Rows whose V1LOCATIONS contains 'united states' and whose
            V1ORGANIZATIONS matches the aviation pattern (both case-insensitive),
            with none of the required fields missing, re-indexed from 0.
    """
    required_fields = [
        'GKGRECORDID', 'V2.1DATE', 'V2SOURCECOLLECTIONIDENTIFIER',
        'V2DOCUMENTIDENTIFIER', 'V1LOCATIONS',
        'V1ORGANIZATIONS', 'V1.5TONE', 'V2GCAM',
    ]

    # Step 1: keep rows that mention the United States anywhere in the location list.
    us_rows = df.loc[
        df['V1LOCATIONS'].str.contains('united states', case=False, na=False)
    ]

    # Step 2: of those, keep rows whose organization list mentions aviation terms or a named carrier.
    aviation_rows = us_rows.loc[
        us_rows['V1ORGANIZATIONS'].str.contains(
            "airplane|airline|airport|Alaska Airlines|American Airlines|Delta Air Lines|Frontier Airlines|Hawaiian Airlines|JetBlue|Southwest Airlines|Spirit Airlines|Sun Country Airlines|United Airlines|Allegiant Air",
            case=False, na=False, regex=True,
        )
    ]

    # Step 3: discard incomplete records and renumber the survivors.
    return aviation_rows.dropna(subset=required_fields).reset_index(drop=True)

In [122]:
# Try downloading, cleaning, and saving the first chunk.
start = datetime.datetime(2018, 1, 1, 0, 0, 0)
end = datetime.datetime(2025, 5, 1, 0, 0, 0)
urls = generate_url_list(start, end, 15)

# Shuffle so the first chunk is a sample spread over the whole date range.
# A fixed seed makes the sample identical on every run.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(urls)

chunked_datetime_strings = chunk_datetime_strings(urls, 1000)
chunk = chunked_datetime_strings[0]

df = download_chunk_parallel(chunk, max_workers=10)
# download_chunk_parallel returns None when nothing could be downloaded; fail
# here with a clear message instead of a TypeError inside first_pass_clean.
if df is None:
    raise RuntimeError("No archive in the chunk could be downloaded; see the failed_downloads_*.txt log.")
df = first_pass_clean(df)

timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"../data/processed/gdelt_cleaned_{timestamp}.csv"
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)


Downloading:   6%|▌         | 58/1000 [00:28<07:47,  2.02file/s]


: 

: 

In [None]:
df

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
0,20210922201500-119,20210922201500,1,gcaptain.com,https://gcaptain.com/operation-dgar-a-collecti...,MARITIME_INCIDENT;MARITIME;MANMADE_DISASTER_IM...,"4#Sofia, Sofiya-Grad, Bulgaria#BU#BU42#42.6833...",boeing;centers for disease control;washington ...,"-0.253899165759884,2.24882118244469,2.50272034...","wc:2446,c1.2:15,c12.1:109,c12.10:245,c12.11:1,...",,<PAGE_AUTHORS>Editorial</PAGE_AUTHORS><PAGE_PR...
1,20210922201500-142,20210922201500,1,iheart.com,https://kzbb.iheart.com/content/2021-09-22-uni...,TAX_FNCACT;TAX_FNCACT_EMPLOYEES;TRIAL;USPEC_PO...,1#United States#US#US#39.828175#-98.5795#US,united airlines,"-2.31362467866324,3.59897172236504,5.912596401...","wc:362,c1.4:2,c12.1:25,c12.10:32,c12.12:7,c12....",,<PAGE_LINKS>https://news.bloomberglaw.com/heal...
2,20210922201500-263,20210922201500,1,freebeacon.com,https://freebeacon.com/biden-administration/po...,LEADER;TAX_FNCACT;TAX_FNCACT_PRESIDENT;USPEC_P...,1#Afghanistan#AF#AF#33#66#AF;1#United States#U...,kabul hamid karzai international airport,"-4.40835266821346,3.01624129930394,7.424593967...","wc:387,c12.1:14,c12.10:26,c12.12:12,c12.13:2,c...",,<PAGE_LINKS>https://freebeacon.com/biden-admin...
3,20210922201500-517,20210922201500,1,theepochtimes.com,https://www.theepochtimes.com/lax-to-pilot-ear...,SOC_POINTSOFINTEREST;SOC_POINTSOFINTEREST_AIRP...,"2#California, United States#US#USCA#36.17#-119...",city council;performance commission;epoch time...,"-0.613496932515337,3.06748466257669,3.68098159...","wc:306,c12.1:15,c12.10:24,c12.12:11,c12.13:5,c...",,<PAGE_LINKS>https://www.theepochtimes.com/t-ea...
4,20210922201500-632,20210922201500,1,clevelandstar.com,https://www.clevelandstar.com/news/271259617/t...,TAX_DISEASE;TAX_DISEASE_COVID;SOC_POINTSOFINTE...,"1#Mexico#MX#MX#23#-102#MX;4#Montego Bay, Saint...",augustine medical services;technology platform...,"1.5180265654649,2.56166982922201,1.04364326375...","wc:935,c1.2:2,c12.1:40,c12.10:79,c12.12:14,c12...",,<PAGE_LINKS>https://pr.report/9jZ0XTSu;https:/...
...,...,...,...,...,...,...,...,...,...,...,...,...
92,20201003204500-681,20201003204500,1,dailymail.co.uk,https://www.dailymail.co.uk/news/article-88021...,GENERAL_HEALTH;MEDICAL;TAX_FNCACT;TAX_FNCACT_D...,"2#New York, United States#US#USNY#42.1497#-74....",duluth international airport;white house;veter...,"-2.1,1.13333333333333,3.23333333333333,4.36666...","wc:2763,c1.2:1,c12.1:190,c12.10:246,c12.12:76,...",,<PAGE_LINKS>https://www.dailymail.co.uk/news/c...
93,20201003204500-687,20201003204500,1,weny.com,https://www.weny.com/story/42717304/the-latest...,TAX_DISEASE;TAX_DISEASE_CORONAVIRUS;TAX_DISEAS...,"3#Capitol Hill, New Jersey, United States#US#U...",senate republicans;white house;christie;democr...,"-2.41545893719807,1.20772946859903,3.623188405...","wc:1492,c1.2:1,c1.3:1,c12.1:130,c12.10:118,c12...",,<PAGE_TITLE>The Latest: President Trump tweets...
94,20201003204500-763,20201003204500,1,newstoday.com.bd,http://www.newstoday.com.bd/index.php?option=d...,TAX_FNCACT;TAX_FNCACT_AIDE;TAX_FNCACT_STAFFER;...,"2#Minnesota, United States#US#USMN#45.7326#-93...",twitter;white house;justice department;trump o...,"-3.24149108589951,2.10696920583468,5.348460291...","wc:527,c12.1:42,c12.10:46,c12.12:20,c12.13:8,c...",,"<PAGE_TITLE>Hope Hicks, the White House counse..."
95,20201003204500-822,20201003204500,1,benzinga.com,https://www.benzinga.com/media/20/10/17767520/...,LEADER;ECON_STOCKMARKET;ELECTION;,1#United States#US#US#39.828175#-98.5795#US,morgan stanley;terex corporation;energy stocks...,"-0.151745068285281,1.82094081942337,1.97268588...","wc:583,c1.2:4,c1.3:1,c12.1:40,c12.10:55,c12.12...",,<PAGE_LINKS>http://www.benzinga.com/topic/barr...


In [78]:
# Same two filters as first_pass_clean, applied to a copy of df; the original
# row index is kept and no rows are dropped for missing fields.
df2 = (
    df.copy()
    # Limit to include United States (removes about half of data)
    .loc[lambda frame: frame['V1LOCATIONS'].str.contains('united states', case=False, na=False)]
    # Limit to include airlines
    .loc[lambda frame: frame['V1ORGANIZATIONS'].str.contains(
        "airplane|airline|airport|Alaska Airlines|American Airlines|Delta Air Lines|Frontier Airlines|Hawaiian Airlines|JetBlue|Southwest Airlines|Spirit Airlines|Sun Country Airlines|United Airlines|Allegiant Air"
        , case=False, na=False, regex=True)]
)

df2

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",google;big data solutions;devops summit power ...,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
577,20180101000000-577,20180101000000,1,vcstar.com,http://www.vcstar.com/story/money/nation-now/2...,TAX_FNCACT;TAX_FNCACT_INSIDER;EDUCATION;,"2#Arizona, United States#US#USAZ#33.7712#-111....",frank lloyd wright foundation;arizona biltmore...,"0.966183574879227,1.69082125603865,0.724637681...","wc:801,c1.3:2,c1.4:2,c12.1:42,c12.10:33,c12.12...",,<PAGE_LINKS>http://jobs.vcstar.com;http://www....
765,20180101000000-765,20180101000000,1,abc10.com,http://www.abc10.com/news/nation-world/legal-m...,CRIME_ILLEGAL_DRUGS;TAX_FNCACT;TAX_FNCACT_EVAN...,"2#Maine, United States#US#USME#44.6074#-69.397...",facebook;big tobacco;los angeles international...,"0.0676132521974306,2.77214334009466,2.70453008...","wc:1353,c1.2:11,c1.3:1,c12.1:80,c12.10:130,c12...",,<PAGE_LINKS>https://www.usatoday.com/story/new...
769,20180101000000-769,20180101000000,1,kiwiblog.co.nz,https://www.kiwiblog.co.nz/2018/01/general_deb...,TAX_WORLDARACHNIDS;TAX_WORLDARACHNIDS_TICKS;CR...,1#Mexico#MX#MX#23#-102#MX;1#United States#US#U...,fairport convention;washington post;veterans a...,"-1.22897800776197,3.6222509702458,4.8512289780...","wc:1396,c1.2:1,c1.3:8,c12.1:160,c12.10:150,c12...",,<PAGE_LINKS>;http://cci-reanalyzer.org/wx/Dail...
944,20180101000000-944,20180101000000,1,kiwiblog.co.nz,https://www.kiwiblog.co.nz/2018/01/general_deb...,TAX_WORLDARACHNIDS;TAX_WORLDARACHNIDS_TICKS;CR...,1#Mexico#MX#MX#23#-102#MX;1#United States#US#U...,fairport convention;washington post;veterans a...,"-1.22897800776197,3.6222509702458,4.8512289780...","wc:1396,c1.2:1,c1.3:8,c12.1:160,c12.10:150,c12...",,<PAGE_LINKS>;http://cci-reanalyzer.org/wx/Dail...
...,...,...,...,...,...,...,...,...,...,...,...,...
135832,20180102004500-1009,20180102004500,1,newson6.com,http://www.newson6.com/story/37170869/high-pro...,TRIAL;SOC_GENERALCRIME;KILL;GENERAL_HEALTH;MED...,"2#Oklahoma, United States#US#USOK#35.5376#-96....",tulsa international airport,"-7.38461538461538,1.23076923076923,8.615384615...","wc:291,c12.1:20,c12.10:36,c12.12:17,c12.13:11,...",,<PAGE_AUTHORS>Taylor Newcomb;By: Taylor Newcom...
135979,20180102004500-1156,20180102004500,1,byronnews.com.au,https://www.byronnews.com.au/news/premium-econ...,EPU_ECONOMY_HISTORIC;,"2#New York, United States#US#USNY#42.1497#-74....",new york times;singapore airline;singapore air...,"2.66666666666667,3.93939393939394,1.2727272727...","wc:1480,c1.1:1,c1.2:8,c1.4:1,c12.1:79,c12.10:1...",,<PAGE_LINKS>http://www.singaporeairlines.com/;...
136101,20180102004500-1278,20180102004500,1,startribune.com,http://www.startribune.com/airline-mechanical-...,GENERAL_HEALTH;MEDICAL;SOC_POINTSOFINTEREST;SO...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines;massachusetts port authority...,"-1.48148148148148,0,1.48148148148148,1.4814814...","wc:127,c12.1:1,c12.10:3,c12.12:1,c12.13:1,c12....",,<PAGE_LINKS>http://m.startribune.com/nation/</...
136252,20180102004500-1429,20180102004500,1,cqnews.com.au,https://www.cqnews.com.au/news/premium-economy...,EPU_ECONOMY_HISTORIC;,1#Italy#IT#IT#42.833333#12.833333#IT;2#New Yor...,new york times;singapore airline;singapore air...,"2.65700483091787,3.98550724637681,1.3285024154...","wc:1482,c1.1:1,c1.2:8,c1.4:1,c12.1:82,c12.10:1...",,<PAGE_LINKS>http://www.singaporeairlines.com/;...


In [80]:
# Reshape df2 to one row per (record, organization).
df2 = (
    df2
    # Turn the semicolon-delimited string into a list of names ...
    .assign(V1ORGANIZATIONS=lambda frame: frame['V1ORGANIZATIONS'].str.split(';'))
    # ... and give every list element its own row (other columns are repeated).
    .explode('V1ORGANIZATIONS')
    # Trim surrounding whitespace from each name.
    .assign(V1ORGANIZATIONS=lambda frame: frame['V1ORGANIZATIONS'].str.strip())
    # Drop rows whose organization is missing or empty.
    .loc[lambda frame: frame['V1ORGANIZATIONS'].notna() & (frame['V1ORGANIZATIONS'] != '')]
)

df2

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",google,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",big data solutions,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",devops summit power panel big data,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",marketing at cloud academy,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
573,20180101000000-573,20180101000000,1,sys-con.com,http://wearables.sys-con.com/node/4211457,TAX_FNCACT;TAX_FNCACT_DRIVERS;,"3#Lighthouse Point, Florida, United States#US#...",eventbrite financial technology,"1.09239872553482,3.03823395539372,1.9458352298...","wc:7547,c1.2:18,c1.3:10,c1.4:4,c12.1:491,c12.1...",,<PAGE_LINKS>;http://carmengonzalez.sys-con.com...
...,...,...,...,...,...,...,...,...,...,...,...,...
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",cherokee place united methodist church,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",charleston international airport,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",sandy hook elementary school in newton,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>
136302,20180102004500-1479,20180102004500,1,postandcourier.com,https://www.postandcourier.com/photo_galleries/,EDUCATION;,"3#Charleston, South Carolina, United States#US...",college of charleston on,"0.209424083769634,2.82722513089005,2.617801047...","wc:861,c1.1:2,c1.4:1,c12.1:41,c12.10:68,c12.11...",,<PAGE_AUTHORS>Post;Courier</PAGE_AUTHORS>


In [81]:
df2['V1ORGANIZATIONS'].value_counts().head(20)

V1ORGANIZATIONS
associated press                    154
united states                       145
china airlines                      144
pasadena police                     128
twitter                              95
american airlines                    79
facebook                             71
college football playoff             60
hawaiian airlines                    46
instagram                            44
arizona state university             43
frank lloyd wright foundation        43
sky harbor international airport     42
york guggenheim museum               42
first christian church               42
massachusetts port authority         41
cnn                                  40
arizona biltmore hotel               40
white house                          38
delta air lines                      38
Name: count, dtype: int64

In [73]:
# Rows whose organization is exactly 'american airlines', copied into a separate frame.
# NOTE(review): an exact match presumably needs df2 to hold one organization per row
# (the split/explode cell) -- confirm that cell has been run first.
df4=df2[df2['V1ORGANIZATIONS']=='american airlines'].copy()
df4

Unnamed: 0,GKGRECORDID,V2.1DATE,V2SOURCECOLLECTIONIDENTIFIER,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,V1LOCATIONS,V1ORGANIZATIONS,V1.5TONE,V2GCAM,V2.1TRANSLATIONINFO,V2EXTRASXML
3612,20180101003000-1053,20180101003000,1,wqcmfm.com,http://www.wqcmfm.com/syndicated-article/?id=5...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
4200,20180101004500-127,20180101004500,1,khar590.com,http://www.khar590.com/syndicated-article/?id=...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
4552,20180101004500-479,20180101004500,1,1640thechamp.com,http://www.1640thechamp.com/syndicated-article...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
6442,20180101010000-806,20180101010000,1,thezone1059.com,http://www.thezone1059.com/syndicated-article/...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-3.96825396825397,0.396825396825397,4.36507936...","wc:255,c12.1:14,c12.10:15,c12.12:8,c12.13:4,c1...",,
9563,20180101013000-904,20180101013000,1,rockthefox.com,http://www.rockthefox.com/syndicated-article/?...,,"3#Sioux Falls, South Dakota, United States#US#...",american airlines,"-4.7808764940239,0.398406374501992,5.179282868...","wc:258,c12.1:14,c12.10:13,c12.12:9,c12.13:2,c1...",,
...,...,...,...,...,...,...,...,...,...,...,...,...
134465,20180102003000-1263,20180102003000,1,am930theanswer.com,http://am930theanswer.com/news/national/airlin...,GENERAL_HEALTH;MEDICAL;SOC_POINTSOFINTEREST;SO...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines,"-1.40845070422535,0,1.40845070422535,1.4084507...","wc:130,c12.1:1,c12.10:3,c12.12:1,c12.13:1,c12....",,<PAGE_LINKS>http://am930theanswer.com/news/nat...
134768,20180102003000-1566,20180102003000,1,newsadvance.com,http://www.newsadvance.com/news/national/wire/...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;SOC_POINT...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines,"-1.48148148148148,0,1.48148148148148,1.4814814...","wc:130,c12.1:3,c12.10:5,c12.12:1,c12.13:2,c12....",,<PAGE_LINKS>http://www.wcvb.com/article/4-peop...
135711,20180102004500-888,20180102004500,1,americanlivewire.com,http://americanlivewire.com/2013-12-14-1-foot-...,,"3#Carrabassett, Maine, United States#US#USME#4...",american airlines,"-1.61507402422611,1.74966352624495,3.364737550...","wc:684,c1.2:1,c12.1:35,c12.10:66,c12.12:21,c12...",,<PAGE_LINKS>http://americanlivewire.com/author...
135761,20180102004500-938,20180102004500,1,kctv5.com,http://www.kctv5.com/story/37170886/mechanical...,TAX_ETHNICITY;TAX_ETHNICITY_AMERICAN;SOC_POINT...,"3#Boston, Massachusetts, United States#US#USMA...",american airlines,"-1.45985401459854,0,1.45985401459854,1.4598540...","wc:133,c12.1:3,c12.10:5,c12.12:1,c12.13:2,c12....",,<PAGE_LINKS>http://www.wcvb.com/article/4-peop...


In [70]:
# Frequency table: one row per distinct organization with its number of occurrences.
# reset_index() already returns a DataFrame, so no extra constructor call is needed.
df3 = (
    df2['V1ORGANIZATIONS']
    .value_counts()
    .reset_index()
)
df3

Unnamed: 0,V1ORGANIZATIONS,count
0,united states,11857
1,associated press,8914
2,twitter,6368
3,facebook,4249
4,cnn,2984
...,...,...
44959,seaman high school,1
44960,university of alabama birmingham,1
44961,florence events center,1
44962,georgia southern university jiann,1
