# Filtering Through Original Data
We want to submit only relevant data to the API in order to reduce cost and time. Given a large collection of data, the first and easiest step is to keep only the jobs that fall into a few industry categories already defined by LinkedIn. To filter the data further, we use a JSON file containing the relevant companies and select only the jobs associated with those companies. Several parts of the code can be changed as needed, such as removing the company filter, or adding or removing the fields that are extracted from each line of data. Make sure to replace the directory strings with paths that are specific to your machine or environment.

## Required Input & Files

A directory containing all of the JSON data files to iterate through (only files whose names start with `part-` are processed).

A JSON file listing the companies to filter for, with one JSON object per line, each containing a `compid` field.

An empty JSON file to write output to (results are appended to it, one JSON object per line).

The Python package "pandas" installed.

In [None]:
pip install pandas

In [None]:
import pandas as pd
import json
import os

# Directory holding the data files to scan; edit this path for your machine.
masterDirectory = "/Your/Directory/Here"
files = os.listdir(masterDirectory)
total_files = len(files)
print("Files in directory:", files)

# Map each relevant company id ('compid') to its full record, so that checking
# whether a job belongs to a relevant company is a single dict lookup.
company_data = {}
# The company file is read one JSON object per line. JSON text is UTF-8, so
# the encoding is stated explicitly instead of relying on the platform default.
with open("Path/To/JSON/Company/File", 'r', encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads would otherwise raise JSONDecodeError on them.
        if not line.strip():
            continue
        company = json.loads(line)
        company_data[company['compid']] = company  # store company info by 'compid'

# Define the target industries
target_industries = {"financial services", "capital markets", "investment management"}

# Collects one list of extracted job records per processed file.
masterList = []

def _extract_job(employee_id, exp, industries, companies):
    """Build the output record for one experience entry, or return None.

    Args:
        employee_id: Value of the 'id' field of the person the entry belongs to.
        exp: One element of that person's 'experience' list.
        industries: Collection of industry names to keep.
        companies: Mapping (or set) of company ids to keep; the entry's
            company 'linkedin_id' is looked up in it.

    Returns:
        A dict with the selected fields when the entry's company is in one of
        the target industries and is one of the relevant companies; otherwise
        None.
    """
    if not isinstance(exp, dict):
        return None
    # .get() so an entry without a 'company' key is skipped instead of raising.
    company = exp.get('company')
    if company is None:
        return None
    if company.get('industry') not in industries:
        return None
    linkedin_id = company.get('linkedin_id')
    if linkedin_id not in companies:
        return None

    title = exp.get('title')
    has_title = title is not None
    return {
        'employee_id': employee_id,
        'company.linkedin_id': linkedin_id,
        'start_date': exp.get('start_date'),
        'end_date': exp.get('end_date'),
        'location_names': exp.get('location_names'),
        'job_summary': exp.get('summary'),
        'title.name': title.get('name') if has_title else None,
        'title.role': title.get('role') if has_title else None,
        'title.sub_roles': title.get('sub_roles', []) if has_title else [],
        'title.levels': title.get('levels', []) if has_title else [],
    }


# Number of lines pandas reads per chunk while loading a file.
chunksize = 10000

# Process each file
for file_index, individualFile in enumerate(files, start=1):
    # Only files named "part-*" are treated as data; this test also excludes
    # entries such as ".DS_Store".
    if not individualFile.startswith("part-"):
        print(f"Skipping file {file_index} of {total_files}: {individualFile} (not a valid JSON file)")
        continue
    print(f"Processing file {file_index} of {total_files}: {individualFile}")
    file_path = os.path.join(masterDirectory, individualFile)

    chunks = []

    print("Loading user data chunks")
    with pd.read_json(file_path, lines=True, chunksize=chunksize) as reader:
        for i, chunk in enumerate(reader, start=1):
            chunks.append(chunk)
            print(f"Chunk {i} loaded for file {file_index} of {total_files}", end='\r')

    # List to store the selected data
    selected_data = []

    if not chunks:
        # Nothing was read from this file; pd.concat raises ValueError on an
        # empty list, so record an empty result and move on.
        masterList.append(selected_data)
        continue

    df = pd.concat(chunks, ignore_index=True)
    total_rows = len(df)

    # Iterate over just the two columns that are used. ignore_index=True above
    # makes the index 0..n-1, so enumerate() yields the same row numbers.
    print("Iterating through rows")
    for index, (employee_id, experiences) in enumerate(zip(df['id'], df['experience'])):
        if index % 1000 == 0 or index == total_rows - 1:  # Progress every 1000 rows or at the end
            print(f"Processing row {index + 1} of {total_rows} for file {file_index} of {total_files}", end='\r')
        # Records without an experience list come through as NaN/None.
        if not isinstance(experiences, (list, tuple)):
            continue
        # Keep only experiences in relevant industries at relevant companies
        for exp in experiences:
            record = _extract_job(employee_id, exp, target_industries, company_data)
            if record is not None:
                selected_data.append(record)
    masterList.append(selected_data)

# The progress messages printed while processing end with '\r'; emit a newline
# so the following output starts on a fresh line.
print()

# Append every extracted record to the output file, one JSON object per line
json_output_file = "path/to/your/.json/output/file"
with open(json_output_file, 'a') as out:
    # masterList holds one list of records per processed file.
    print("Master list size:", len(masterList))
    out.writelines(
        json.dumps(record) + "\n"
        for per_file_records in masterList
        for record in per_file_records
    )