In [20]:
import pandas as pd
import json

In [21]:
def analyze_missing_values(json_data):
    """Count missing name and birth-date fields across patient records.

    A field counts as missing when it is absent, JSON ``null``, NaN, or a
    string that is empty after stripping whitespace. Records that have no
    ``patientDetails`` object are treated as having every field missing.

    Args:
        json_data: JSON text encoding a list of records; the fields are read
            from each record's ``patientDetails`` object.

    Returns:
        dict mapping each of ``firstName``, ``lastName`` and ``birthDate`` to
        a dict with ``missing_count`` (int), ``total_records`` (int) and
        ``missing_percentage`` (float rounded to 2 decimals; 0.0 when there
        are no records at all).

    Raises:
        json.JSONDecodeError: if ``json_data`` is not valid JSON.
    """
    columns = ('firstName', 'lastName', 'birthDate')

    def is_missing(value):
        # Absent keys and JSON null both arrive here as None.
        if value is None:
            return True
        # json.loads accepts the NaN literal; NaN is the only value != itself.
        if isinstance(value, float) and value != value:
            return True
        # Non-string values (numbers, objects) count as present.
        return isinstance(value, str) and not value.strip()

    records = json.loads(json_data)
    details = [(record.get('patientDetails') or {}) for record in records]
    total_records = len(details)

    missing_values = {}
    for column in columns:
        missing_count = sum(1 for detail in details if is_missing(detail.get(column)))
        # Guard the empty-input case instead of dividing by zero.
        if total_records:
            percentage = round((missing_count / total_records) * 100, 2)
        else:
            percentage = 0.0

        missing_values[column] = {
            'missing_count': missing_count,
            'total_records': total_records,
            'missing_percentage': percentage
        }

    return missing_values

In [22]:
# Load the raw JSON text once; each analysis function below parses it itself
# with json.loads, so the text (not a parsed object) is what gets passed around.
# JSON is UTF-8 by specification, so pin the encoding rather than relying on
# the platform's default locale encoding.
with open('DataEngineeringQ2.json', 'r', encoding='utf-8') as file:
    json_data = file.read()

# Missing-value statistics for firstName / lastName / birthDate.
results = analyze_missing_values(json_data)


In [23]:
# Print one summary block per analysed field.
for field in results:
    field_stats = results[field]
    print(f"\n{field}:")
    print(f"Total Records: {field_stats['total_records']}")
    print(f"Missing Count: {field_stats['missing_count']}")
    print(f"Missing Percentage: {field_stats['missing_percentage']}%")


firstName:
Total Records: 31
Missing Count: 0
Missing Percentage: 0.0%

lastName:
Total Records: 31
Missing Count: 22
Missing Percentage: 70.97%

birthDate:
Total Records: 31
Missing Count: 10
Missing Percentage: 32.26%


In [24]:
def calculate_female_percentage(json_string):
    """Return the percentage of female patients after mode imputation.

    Gender values are stripped and upper-cased. Blank, null or absent genders
    are replaced with the most frequent non-blank gender (ties resolve to the
    first value in pandas' sorted ``mode()`` result) before counting ``'F'``.

    Args:
        json_string: JSON text encoding a list of records; gender is read from
            each record's ``patientDetails`` object.

    Returns:
        float: percentage of records whose imputed gender is ``'F'``, rounded
        to 2 decimals. Returns 0.0 when there are no records.

    Raises:
        json.JSONDecodeError: if ``json_string`` is not valid JSON.
    """
    records = json.loads(json_string)
    if not records:
        # An empty frame has no 'gender' column and the percentage would
        # divide by zero, so answer directly.
        return 0.0

    df = pd.DataFrame([{
        'gender': (record.get('patientDetails') or {}).get('gender', '')
    } for record in records])
    # astype(str) keeps the .str accessor usable even if every value was null.
    df['gender'] = df['gender'].fillna('').astype(str).str.strip().str.upper()

    total_records = len(df)
    mode_gender = df[df['gender'] != '']['gender'].mode()
    # If every gender is blank there is nothing to impute with.
    mode_value = mode_gender.iloc[0] if not mode_gender.empty else ''
    df['gender_imputed'] = df['gender'].replace('', mode_value)
    females_after_imputation = int((df['gender_imputed'] == 'F').sum())

    return float(round((females_after_imputation / total_records) * 100, 2))


In [25]:
# Percentage of female patients once blank genders are filled with the mode.
results = calculate_female_percentage(json_data)


In [26]:
# Display the value currently bound to `results` as the cell output.
results

32.26

In [27]:
from datetime import datetime 
def analyze_age_groups(json_string, reference_date=None):
    """Bucket patients into age groups based on their birth date.

    Age is the number of completed calendar years at ``reference_date`` (the
    birthday must have been reached), so a patient turns a year older exactly
    on their birthday rather than drifting with a days/365.25 approximation.

    Groups: ``Child`` (age <= 12), ``Teen`` (13-19), ``Adult`` (20-59),
    ``Senior`` (60+), and ``Unknown`` when the birth date is blank, null or
    absent.

    Args:
        json_string: JSON text encoding a list of records; the birth date is
            read from each record's ``patientDetails`` object.
        reference_date: ``datetime`` at which ages are evaluated. Defaults to
            ``datetime.now()``; pass a fixed date for reproducible results.

    Returns:
        dict with ``age_group_counts`` (group -> count, most frequent first),
        ``adult_count``, ``total_records`` and ``known_age_count`` (records
        whose group is not ``Unknown``).

    Raises:
        json.JSONDecodeError: if ``json_string`` is not valid JSON.
    """
    records = json.loads(json_string)
    if reference_date is None:
        reference_date = datetime.now()

    raw_birth_dates = [
        (record.get('patientDetails') or {}).get('birthDate')
        for record in records
    ]
    # Blank strings and nulls become NaT; tz_localize(None) drops any timezone
    # so only the calendar fields are compared below.
    birth_dates = pd.to_datetime(
        pd.Series(raw_birth_dates, dtype='object')
    ).dt.tz_localize(None)

    def age_in_years(birth_date):
        if pd.isnull(birth_date):
            return None
        birthday_reached = (
            (reference_date.month, reference_date.day)
            >= (birth_date.month, birth_date.day)
        )
        return reference_date.year - birth_date.year - (0 if birthday_reached else 1)

    def assign_age_group(age):
        if age is None:
            return "Unknown"
        elif age <= 12:
            return "Child"
        elif age <= 19:
            return "Teen"
        elif age <= 59:
            return "Adult"
        else:
            return "Senior"

    age_groups = [assign_age_group(age_in_years(birth_date)) for birth_date in birth_dates]

    tally = {}
    for group in age_groups:
        tally[group] = tally.get(group, 0) + 1
    # Keep the "most frequent first" ordering that value_counts() produced.
    age_group_counts = dict(sorted(tally.items(), key=lambda item: -item[1]))

    return {
        'age_group_counts': age_group_counts,
        'adult_count': age_group_counts.get('Adult', 0),
        'total_records': len(age_groups),
        'known_age_count': sum(1 for group in age_groups if group != 'Unknown')
    }


In [28]:
# Run the age-group analysis and report how many patients land in the "Adult" group.
results = analyze_age_groups(json_data)
print(f"Adult Count: {results['adult_count']}")


Adult Count: 21


In [29]:
def analyze_medicine_count(json_string):
    """Return the mean number of medicines per record, rounded to 2 decimals.

    Args:
        json_string: JSON text encoding a list of records, each carrying a
            ``consultationData.medicines`` list.

    Returns:
        The average length of the ``medicines`` lists, rounded to 2 decimals.
    """
    records = json.loads(json_string)

    # One entry per record: how many medicines that consultation prescribed.
    per_record = []
    for rec in records:
        prescribed = rec['consultationData']['medicines']
        per_record.append(len(prescribed))

    counts_frame = pd.DataFrame(per_record, columns=['medicine_count'])
    mean_count = counts_frame['medicine_count'].mean()
    return round(mean_count, 2)

In [30]:
# Average number of medicines per consultation record; shown as the cell output.
results = analyze_medicine_count(json_data)
results

2.13

In [31]:
from collections import Counter
def get_third_most_prescribed(json_string):
    """Find the medicine with the third-highest prescription count.

    Ranking is dense: medicines that share a count share a rank and no rank is
    skipped, so a tie for first or second place does not make the third place
    disappear. When several medicines share the third-highest count, the one
    that appears first in the data is returned.

    Args:
        json_string: JSON text encoding a list of records; medicine names are
            read from ``consultationData.medicines[*].medicineName``. Records
            or medicines lacking those keys are skipped.

    Returns:
        ``{'third_medicine': {'name': <str or None>, 'count': <int>}}``.
        ``name`` is ``None`` and ``count`` is 0 when fewer than three distinct
        prescription counts exist.

    Raises:
        json.JSONDecodeError: if ``json_string`` is not valid JSON.
    """
    records = json.loads(json_string)

    # Counter keeps first-seen order, which makes tie-breaking deterministic.
    medicine_counts = Counter(
        med['medicineName']
        for record in records
        for med in ((record.get('consultationData') or {}).get('medicines') or [])
        if med.get('medicineName') is not None
    )

    distinct_counts = sorted(set(medicine_counts.values()), reverse=True)
    if len(distinct_counts) < 3:
        return {'third_medicine': {'name': None, 'count': 0}}

    third_count = distinct_counts[2]
    third_name = next(
        name for name, count in medicine_counts.items() if count == third_count
    )

    return {'third_medicine': {
            'name': third_name,
            'count': third_count
        }}

In [32]:
# Look up the third most prescribed medicine and print its name and prescription count.
results = get_third_most_prescribed(json_data)
print(f"3rd most prescribed medicine: {results['third_medicine']['name']}")
print(f"prescriptions: {results['third_medicine']['count']}")

3rd most prescribed medicine: C
prescriptions: 13


In [33]:
def analyze_medicine_status(json_string):
    """Compute the active/inactive split across all prescribed medicines.

    A medicine is active when its ``isActive`` value is truthy; a medicine
    without that key counts as inactive.

    Args:
        json_string: JSON text encoding a list of records; medicines are read
            from each record's ``consultationData.medicines`` list. Records
            lacking those keys contribute no medicines.

    Returns:
        dict with ``active`` and ``inactive`` entries, each holding ``count``
        (int) and ``percentage`` (float rounded to 2 decimals). Both
        percentages are 0.0 when there are no medicines at all.

    Raises:
        json.JSONDecodeError: if ``json_string`` is not valid JSON.
    """
    records = json.loads(json_string)

    medicines = [
        med
        for record in records
        for med in ((record.get('consultationData') or {}).get('medicines') or [])
    ]

    total_medicines = len(medicines)
    active_count = sum(1 for med in medicines if med.get('isActive', False))
    inactive_count = total_medicines - active_count

    def percentage_of_total(count):
        # No medicines at all: report 0% rather than dividing by zero.
        if not total_medicines:
            return 0.0
        return round((count / total_medicines * 100), 2)

    return {
        'active': {
            'count': active_count,
            'percentage': percentage_of_total(active_count)
        },
        'inactive': {
            'count': inactive_count,
            'percentage': percentage_of_total(inactive_count)
        }
    }


In [34]:
# Report the percentage split between active and inactive medicines.
results = analyze_medicine_status(json_data)
print(f"Active Medicines: {results['active']['percentage']}%")
print(f"Inactive Medicines: {results['inactive']['percentage']}%")

Active Medicines: 69.7%
Inactive Medicines: 30.3%


In [35]:
def is_valid_indian_mobile(phone_number):
    """Check whether a string is a valid Indian mobile number.

    Accepted forms, after stripping surrounding whitespace:
      * 10 ASCII digits,
      * ``+91`` followed by 10 ASCII digits,
      * ``91`` followed by 10 ASCII digits (12 characters in total).

    The 10-digit part must lie between 6000000000 and 9999999999.

    Args:
        phone_number: the candidate value; anything that is not a non-empty
            ``str`` is rejected.

    Returns:
        bool: True when the number is valid, False otherwise.
    """
    if not phone_number or not isinstance(phone_number, str):
        return False

    digits = phone_number.strip()

    if digits.startswith('+91'):
        digits = digits[3:]
    elif digits.startswith('91') and len(digits) == 12:
        # Only treat "91" as a country code when 10 digits follow it;
        # a bare 10-digit number such as 9123456789 merely starts with 91.
        digits = digits[2:]

    if len(digits) != 10:
        return False

    # str.isdigit() also accepts non-ASCII digits, so check explicitly.
    if any(ch not in '0123456789' for ch in digits):
        return False

    return 6000000000 <= int(digits) <= 9999999999

def process_phone_numbers(json_string):
    """Count the records whose phone number is a valid Indian mobile number.

    Args:
        json_string: JSON text encoding a list of records; the number is read
            from each record's top-level ``phoneNumber`` key. A record without
            that key is counted as invalid.

    Returns:
        int: number of records for which ``is_valid_indian_mobile`` is True.

    Raises:
        json.JSONDecodeError: if ``json_string`` is not valid JSON.
    """
    records = json.loads(json_string)
    # The parsed records are local to this call, so nothing is written back
    # to them; only the tally is returned.
    return sum(
        1 for entry in records
        if is_valid_indian_mobile(entry.get('phoneNumber'))
    )

In [36]:
# Count how many records carry a phone number that passes the Indian-mobile check.
valid_count = process_phone_numbers(json_data)
print(f"Number of valid phone numbers: {valid_count}")

Number of valid phone numbers: 18


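Pearson correlation formula, where $x$ is the patient's age and $y$ is the number of active medicines:


$$r = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2 \, \sum_i (y_i - \bar{y})^2}}$$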

In [37]:
import math
def calculate_pearson_correlation(data):
    """Correlate patient age with the number of active medicines.

    Ages are completed calendar years as of 2024-12-13, derived from the date
    part (text before ``'T'``) of ``patientDetails.birthDate``. Records whose
    birth date is blank or cannot be parsed as ``YYYY-MM-DD`` are skipped.
    For every remaining record the active-medicine count is the number of
    entries in ``consultationData.medicines`` with a truthy ``isActive``.

    Args:
        data: JSON text encoding a list of records.

    Returns:
        tuple ``(correlation, valid_pairs)`` where ``valid_pairs`` is a list of
        ``(age, active_medicine_count)`` tuples. ``correlation`` is 0 when
        fewer than two pairs exist or when either variable has zero variance;
        in those cases no summary is printed.
    """
    records = json.loads(data)
    as_of = datetime(2024, 12, 13)

    def calculate_age(birth_date_str):
        if not birth_date_str:
            return None
        try:
            date_part = birth_date_str.split('T')[0]
            born = datetime.strptime(date_part, '%Y-%m-%d')
        except (ValueError, AttributeError):
            # Unparseable text or a non-string value: treat the age as unknown.
            return None
        years = as_of.year - born.year
        # Birthday not reached yet in the reference year.
        if (as_of.month, as_of.day) < (born.month, born.day):
            years -= 1
        return years

    valid_pairs = []
    for entry in records:
        age = calculate_age(entry.get('patientDetails', {}).get('birthDate'))
        if age is None:
            continue
        prescribed = entry.get('consultationData', {}).get('medicines', [])
        active_total = len([med for med in prescribed if med.get('isActive', False)])
        valid_pairs.append((age, active_total))

    n = len(valid_pairs)
    if n < 2:
        return 0, valid_pairs

    ages = [pair[0] for pair in valid_pairs]
    med_counts = [pair[1] for pair in valid_pairs]
    mean_age = sum(ages) / n
    mean_meds = sum(med_counts) / n

    # Deviations from the mean, kept in record order so the sums accumulate
    # left to right.
    age_devs = [value - mean_age for value in ages]
    med_devs = [value - mean_meds for value in med_counts]

    covariance = sum(d_age * d_med for d_age, d_med in zip(age_devs, med_devs))
    var_age = sum(d_age * d_age for d_age in age_devs)
    var_meds = sum(d_med * d_med for d_med in med_devs)

    # A constant variable has no defined correlation; report 0.
    if var_age == 0 or var_meds == 0:
        return 0, valid_pairs

    correlation = covariance / (math.sqrt(var_age) * math.sqrt(var_meds))

    print(f"\nSummary Statistics:")
    print(f"Number of valid data points: {n}")
    print(f"Average age: {mean_age:.2f} years")
    print(f"Average number of active medicines: {mean_meds:.2f}")
    print(f"Pearson correlation coefficient: {correlation:.2f}")

    return correlation, valid_pairs

In [38]:
# Compute the age vs. active-medicine correlation; the function prints its own summary.
correlation, data_points = calculate_pearson_correlation(json_data)


Summary Statistics:
Number of valid data points: 21
Average age: 34.67 years
Average number of active medicines: 1.48
Pearson correlation coefficient: -0.07
