### Get LS Data

In [1]:
import os
import json
import re
import requests
import time
import pandas as pd

In [2]:
def json_files_to_df(folder_path):
    """
    Read JSON files with member data and convert to pandas DataFrame.

    Every ``*.json`` file directly inside ``folder_path`` is loaded (in sorted
    name order). A file whose top-level value contains a ``membersDtoList``
    key contributes one row per member: the file's ``metaDatasDto`` mapping is
    merged into each member record (metadata keys overwrite same-named member
    keys) and a ``source_file`` field records the file name without its
    extension. Files that cannot be read or parsed are reported on stdout and
    skipped; they never abort the whole load.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to scan. Sub-directories are not visited.

    Returns
    -------
    pandas.DataFrame
        One row per member record. Empty when no member records were found.
    """
    all_members = []
    failed_files = []

    # Sort so the row order does not depend on the OS directory-listing order.
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

    # Process each JSON file in folder
    for json_file in json_files:
        file_path = os.path.join(folder_path, json_file)

        try:
            # Read as UTF-8 explicitly instead of relying on the locale default.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Extract member list and metadata from JSON structure
            if 'membersDtoList' in data:
                members = data['membersDtoList']
                metadata = data['metaDatasDto']

                # Strip only the trailing extension; a plain str.replace would
                # also remove '.json' occurring in the middle of the name.
                filename = os.path.splitext(json_file)[0]
                for member in members:
                    member.update(metadata)
                    member['source_file'] = filename

                # Extend only after the whole file was handled, so a file that
                # fails half-way contributes no partial rows.
                all_members.extend(members)

        except Exception as e:
            # Best-effort load: report the problem and move on to the next file.
            print(f"Error reading {json_file}: {e}")
            failed_files.append(json_file)
            continue

    # Convert to DataFrame
    df = pd.DataFrame(all_members)

    # Print summary stats (failed files are not counted as processed)
    print(f"Total JSON files processed: {len(json_files) - len(failed_files)}")
    if failed_files:
        print(f"Files skipped due to errors: {len(failed_files)}")
    print(f"Total members extracted: {len(df)}")

    # The per-file summary needs both columns. They are missing when no member
    # records were loaded (or the records carry no 'mpsno'), in which case the
    # groupby would raise KeyError, so the summary is skipped instead.
    if {'source_file', 'mpsno'}.issubset(df.columns):
        unique_mps = df.groupby('source_file')['mpsno'].nunique().reset_index()
        unique_mps.columns = ['filename', 'unique_mps']
        print("\nUnique MPs per file:")
        print(unique_mps)

    return df

In [3]:
# Load every member JSON file found in the current working directory.
df = json_files_to_df(folder_path="./")

# (rows, columns) of the combined frame
df.shape

Total JSON files processed: 7
Total members extracted: 3740

Unique MPs per file:
  filename  unique_mps
0    ls_12         529
1    ls_13         551
2    ls_14         554
3    ls_15         511
4    ls_16         512
5    ls_17         539
6    ls_18         544


(3740, 42)

In [4]:
# One row per email entry: list-like values in `email` are expanded into
# separate rows. ignore_index=True gives the result a fresh 0..n-1 index;
# otherwise every expanded row repeats its parent's index label, which makes
# later label-based selection and alignment ambiguous.
df_long = df.explode('email', ignore_index=True)

In [5]:
# Undo the "[at]" / "[dot]" obfuscation and keep the first substring that looks
# like an email address; rows without such a substring become NaN.
df_long['email_fix'] = (
    df_long['email']
    .str.strip()
    .str.replace(r'\[at\]', '@', regex=True)
    .str.replace(r'\[dot\]', '.', regex=True)
    # expand=False makes extract return a Series for the single capture group.
    # The default (expand=True) returns a one-column DataFrame, and assigning
    # that to a column only works through implicit single-column coercion.
    .str.extract(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', expand=False)
)

In [6]:
# Count the rows whose cleaned address is a complete, well-formed email.
# Rows with no extracted address (NaN) count as non-matching via na=False.
# The former bare `df_long['email_fix'].dropna()` was removed: it was neither
# in place nor assigned, so it had no effect.
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
df_long['email_fix'].str.match(email_pattern, na=False).sum()

4655

In [7]:
# (rows, columns) of the exploded frame, shown as a quick sanity check.
df_long.shape

(5563, 43)

In [8]:
# Write the long-format table to disk; the index is not written out.
df_long.to_csv("../ls_long.csv", index=False)