In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

df = pd.read_csv('animal_data.csv')

animal_name_mapping = {
    'dogs': 'Dog',
    'cattle': 'Cattle',
    'cats': 'Cat',
    'horses': 'Horse',
    'sheep': 'Sheep',
    'goats': 'Goat',
    'pigs': 'Pig',
    'chickens': 'Chicken',
    'rabbits': 'Rabbit',
    'ducks': 'Duck',
    'geese': 'Goose',
    'mice': 'Mouse',
    'rats': 'Rat'}

# Function to standardize animal names
def standardize_animal_name(name):
    name = name.strip()

    name_lower = name.lower()
    for key, value in animal_name_mapping.items():
        if name_lower == key:
            return value

    if name_lower.endswith('s') and not name_lower in ['species']:
        singular = name[:-1]
        if singular.lower() in [k.lower() for k in animal_name_mapping.values()]:
            for value in animal_name_mapping.values():
                if singular.lower() == value.lower():
                    return value
        return name.title()

df['AnimalName'] = df['AnimalName'].apply(standardize_animal_name)
symptom_columns = ['symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']

# Create a dictionary for common symptom spelling corrections
symptom_corrections = {
    'abnomalities': 'Abnormalities',
    'Abnormalalities': 'Abnormalities',
    'vommitting': 'Vomiting',
    'vomitting': 'Vomiting',
    'Aneamia': 'Anaemia',
    'Anemia': 'Anaemia',
    'Anoxeria': 'Anorexia',
    'Attacks': 'Attack',
    'diarhea': 'Diarrhea',
    'diarrhoea': 'Diarrhea',
    'Dull ness': 'Dullness',
    'Dull': 'Dullness',
    'Gasc': 'Gas',
    'Inappentence': 'Inappetence',
    'weekness': 'Weakness',
    'lethargy': 'Lethargy',
    'lethargic': 'Lethargy',
    'seizuers': 'Seizures',
    'seizuer': 'Seizures',
    'painfull': 'Painful'
}

# Function to standardize symptoms
def standardize_symptom(symptom):
    if pd.isna(symptom) or symptom == "None reported":
        return symptom

    symptom = symptom.strip()

    # Check if it's in our corrections dictionary (case insensitive)
    symptom_lower = symptom.lower()
    for key, value in symptom_corrections.items():
        if symptom_lower == key.lower():
            return value

    return symptom.capitalize()

# Apply standardization to all symptom columns
for col in symptom_columns:
    df[col] = df[col].apply(standardize_symptom)

# Create an unpivoted version for easier analysis
unpivoted_data = []

for idx, row in df.iterrows():
    for col in symptom_columns:
        if pd.notna(row[col]) and row[col] != "None reported":
            unpivoted_data.append({
                'AnimalName': row['AnimalName'],
                'Symptom': row[col],
                'Dangerous': row['Dangerous']
            })

unpivoted_df = pd.DataFrame(unpivoted_data)

# Features for Question 1: Symptom frequency and danger correlation
def create_symptom_frequency_features(df, unpivoted_df):
    # Symptom frequency by species
    symptom_by_species = unpivoted_df.groupby(['AnimalName', 'Symptom']).size().reset_index(name='Count')

    # Get total counts per species for percentages
    species_counts = unpivoted_df['AnimalName'].value_counts().reset_index()
    species_counts.columns = ['AnimalName', 'TotalSymptoms']

    # Merge to calculate percentages
    symptom_by_species = pd.merge(symptom_by_species, species_counts, on='AnimalName')
    symptom_by_species['Percentage'] = symptom_by_species['Count'] / symptom_by_species['TotalSymptoms']

    # Symptom danger correlation
    danger_correlation = unpivoted_df.groupby('Symptom')['Dangerous'].apply(
        lambda x: (x == 'Yes').mean()).reset_index()
    danger_correlation.columns = ['Symptom', 'DangerCorrelation']

    # Overall danger rate for comparison
    overall_danger_rate = (unpivoted_df['Dangerous'] == 'Yes').mean()

    # Calculate danger coefficient (how much more likely danger is with this symptom)
    danger_correlation['DangerCoefficient'] = danger_correlation['DangerCorrelation'] / overall_danger_rate

    return symptom_by_species, danger_correlation

# Features for Question 2: Species prone to dangerous conditions
def create_species_risk_features(df):
    # Species danger rate
    species_danger = df.groupby('AnimalName')['Dangerous'].apply(
        lambda x: (x == 'Yes').mean()
    ).reset_index()
    species_danger.columns = ['AnimalName', 'DangerRate']

    # Count cases for confidence calculation
    species_counts = df['AnimalName'].value_counts().reset_index()
    species_counts.columns = ['AnimalName', 'TotalCases']

    # Merge to get counts
    species_danger = pd.merge(species_danger, species_counts, on='AnimalName')

    # Calculate 95% confidence interval for danger rate
    # Using normal approximation to binomial (valid for large enough samples)
    species_danger['CI_Lower'] = species_danger.apply(
        lambda x: max(0, x['DangerRate'] - 1.96 * np.sqrt((x['DangerRate'] * (1 - x['DangerRate'])) / x['TotalCases'])),
        axis=1
    )
    species_danger['CI_Upper'] = species_danger.apply(
        lambda x: min(1, x['DangerRate'] + 1.96 * np.sqrt((x['DangerRate'] * (1 - x['DangerRate'])) / x['TotalCases'])),
        axis=1
    )

    # Calculate symptom count distribution
    # First, count non-NA symptoms per row
    df['SymptomCount'] = df[['symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']].apply(
        lambda x: sum(1 for item in x if pd.notna(item) and item != "None reported"),
        axis=1
    )

    # Symptom count stats by species
    symptom_count_stats = df.groupby('AnimalName')['SymptomCount'].agg(['mean', 'median', 'min', 'max']).reset_index()

    # Correlation between symptom count and danger by species
    symptom_danger_corr = []
    for animal in df['AnimalName'].unique():
        animal_df = df[df['AnimalName'] == animal]
        if len(animal_df) > 5:  # Only calculate if we have enough samples
            danger_numeric = (animal_df['Dangerous'] == 'Yes').astype(int)
            correlation = np.corrcoef(animal_df['SymptomCount'], danger_numeric)[0, 1]
            symptom_danger_corr.append({
                'AnimalName': animal,
                'Correlation': correlation
            })

    symptom_danger_corr_df = pd.DataFrame(symptom_danger_corr)

    return species_danger, symptom_count_stats, symptom_danger_corr_df

# Features for Question 3: Symptom combinations and danger likelihood
def create_symptom_combination_features(df):
    # Create combinations of symptoms
    symptom_combinations = []

    for idx, row in df.iterrows():
        symptoms = [row[f'symptoms{i}'] for i in range(1, 6)
                   if pd.notna(row[f'symptoms{i}']) and row[f'symptoms{i}'] != "None reported"]

        # Record individual symptoms
        for symptom in symptoms:
            symptom_combinations.append({
                'AnimalName': row['AnimalName'],
                'Combination': symptom,
                'CombinationType': 'Single',
                'Dangerous': row['Dangerous']
            })

        # Generate pairs of symptoms
        if len(symptoms) >= 2:
            for i in range(len(symptoms)):
                for j in range(i+1, len(symptoms)):
                    pair = f"{symptoms[i]} + {symptoms[j]}"
                    symptom_combinations.append({
                        'AnimalName': row['AnimalName'],
                        'Combination': pair,
                        'CombinationType': 'Pair',
                        'Dangerous': row['Dangerous']
                    })

        # Generate triplets of symptoms
        if len(symptoms) >= 3:
            for i in range(len(symptoms)):
                for j in range(i+1, len(symptoms)):
                    for k in range(j+1, len(symptoms)):
                        triplet = f"{symptoms[i]} + {symptoms[j]} + {symptoms[k]}"
                        symptom_combinations.append({
                            'AnimalName': row['AnimalName'],
                            'Combination': triplet,
                            'CombinationType': 'Triplet',
                            'Dangerous': row['Dangerous']
                        })

    # Create DataFrame from the combinations
    combinations_df = pd.DataFrame(symptom_combinations)

    # Calculate danger rate for each combination
    combination_stats = combinations_df.groupby(['Combination', 'CombinationType']).agg(
        Count=('Dangerous', 'count'),
        DangerousCount=('Dangerous', lambda x: (x == 'Yes').sum())
    ).reset_index()

    combination_stats['DangerRate'] = combination_stats['DangerousCount'] / combination_stats['Count']

    # Filter to combinations that appear at least 3 times for reliability
    combination_stats = combination_stats[combination_stats['Count'] >= 3]

    # Sort by danger rate and count
    combination_stats = combination_stats.sort_values(['DangerRate', 'Count'], ascending=[False, False])

    return combinations_df, combination_stats

# Features for Question 4: Species-specific patterns
def create_species_pattern_features(df, unpivoted_df):
    # Species-specific symptoms (distinctiveness score)
    # Calculate the proportion of each symptom within a species
    symptom_species_prop = unpivoted_df.groupby(['AnimalName', 'Symptom']).size().reset_index(name='Count')

    # Get total for each species
    species_totals = symptom_species_prop.groupby('AnimalName')['Count'].sum().reset_index()
    species_totals.columns = ['AnimalName', 'TotalSymptoms']

    # Calculate proportion within species
    symptom_species_prop = pd.merge(symptom_species_prop, species_totals, on='AnimalName')
    symptom_species_prop['PropWithinSpecies'] = symptom_species_prop['Count'] / symptom_species_prop['TotalSymptoms']

    # Calculate the overall proportion of this symptom across all species
    symptom_totals = symptom_species_prop.groupby('Symptom')['Count'].sum().reset_index()
    total_symptoms = symptom_species_prop['Count'].sum()
    symptom_totals['OverallProportion'] = symptom_totals['Count'] / total_symptoms

    # Merge back to calculate distinctiveness
    symptom_species_prop = pd.merge(symptom_species_prop, symptom_totals[['Symptom', 'OverallProportion']], on='Symptom')

    # Distinctiveness score: how much more common this symptom is in this species vs overall
    symptom_species_prop['Distinctiveness'] = symptom_species_prop['PropWithinSpecies'] / symptom_species_prop['OverallProportion']

    # Species-specific danger patterns
    # For each species, which symptoms correlate most with danger
    species_danger_patterns = []

    for animal in unpivoted_df['AnimalName'].unique():
        animal_data = unpivoted_df[unpivoted_df['AnimalName'] == animal]

        # Only proceed if we have enough data for this species
        if len(animal_data) < 10:
            continue

        # For each symptom in this species, calculate danger correlation
        for symptom in animal_data['Symptom'].unique():
            symptom_data = animal_data[animal_data['Symptom'] == symptom]
            if len(symptom_data) < 5:  # Skip if too few instances
                continue

            danger_rate = (symptom_data['Dangerous'] == 'Yes').mean()
            species_danger_patterns.append({
                'AnimalName': animal,
                'Symptom': symptom,
                'SymptomCount': len(symptom_data),
                'DangerRate': danger_rate
            })

    species_danger_patterns_df = pd.DataFrame(species_danger_patterns)

    # Calculate overall danger rate for each species for comparison
    species_overall_danger = unpivoted_df.groupby('AnimalName')['Dangerous'].apply(
        lambda x: (x == 'Yes').mean()
    ).reset_index()
    species_overall_danger.columns = ['AnimalName', 'OverallDangerRate']

    # Merge to calculate relative risk
    species_danger_patterns_df = pd.merge(species_danger_patterns_df,
                                         species_overall_danger,
                                         on='AnimalName')

    # Relative risk: how much more/less dangerous this symptom is compared to species average
    species_danger_patterns_df['RelativeRisk'] = species_danger_patterns_df['DangerRate'] / species_danger_patterns_df['OverallDangerRate']

    # Sort to find most predictive symptoms for each species
    species_danger_patterns_df = species_danger_patterns_df.sort_values(['AnimalName', 'RelativeRisk'], ascending=[True, False])

    return symptom_species_prop, species_danger_patterns_df

# Run all the feature creation functions
symptom_by_species, danger_correlation = create_symptom_frequency_features(df, unpivoted_df)
species_danger, symptom_count_stats, symptom_danger_corr_df = create_species_risk_features(df)
combinations_df, combination_stats = create_symptom_combination_features(df)
symptom_species_prop, species_danger_patterns_df = create_species_pattern_features(df, unpivoted_df)

symptom_by_species.to_csv('symptom_by_species.csv', index=False)
danger_correlation.to_csv('symptom_danger_correlation.csv', index=False)
species_danger.to_csv('species_danger_rates.csv', index=False)
symptom_count_stats.to_csv('symptom_count_by_species.csv', index=False)
symptom_danger_corr_df.to_csv('symptom_count_danger_correlation.csv', index=False)
combination_stats.to_csv('symptom_combination_analysis.csv', index=False)
symptom_species_prop.to_csv('species_specific_symptoms.csv', index=False)
species_danger_patterns_df.to_csv('species_danger_patterns.csv', index=False)

  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[None, :]


Data transformation complete. All derived datasets saved to CSV files.
Animal names standardized: 37 unique species found.
Animals with plural forms consolidated (e.g., 'Dogs' -> 'Dog').
Symptoms standardized to consistent capitalization.

Validation tests:
Checking 'Dog' entries:
AnimalName
Dog    97
Name: count, dtype: int64

Checking 'Cattle' entries:
AnimalName
Cattle    83
Name: count, dtype: int64


In [13]:
symptom_by_species['Symptom'].value_counts()

Unnamed: 0_level_0,count
Symptom,Unnamed: 1_level_1
Weight loss,23
Diarrhea,23
Fever,22
Depression,22
Loss of appetite,21
...,...
Head shking,1
Head tossing,1
Hoarseness,1
Isolation from flock,1


In [16]:
symptom_columns = ['symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']
df['SymptomCount'] = df[symptom_columns].apply(
            lambda row: row[row != "None reported"].count(), axis=1)
df.head()

Unnamed: 0,AnimalName,symptoms1,symptoms2,symptoms3,symptoms4,symptoms5,Dangerous,SymptomCount
0,Dog,Fever,Diarrhea,Vomiting,Weight loss,Dehydration,Yes,5
1,Dog,Fever,Diarrhea,Coughing,Tiredness,Pains,Yes,5
2,Dog,Fever,Diarrhea,Coughing,Vomiting,Anorexia,Yes,5
3,Dog,Fever,Difficulty breathing,Coughing,Lethargy,Sneezing,Yes,5
4,Dog,Fever,Diarrhea,Coughing,Lethargy,Blue eye,Yes,5


In [17]:
df.to_csv('animals_data.csv', index=False)