In [3]:
import pandas as pd

# Define file paths
# NOTE: filtered_genes.csv is the file produced by the gene-filtering cell
# (its `output_file` points at the same path), so that cell has to be run
# before this one on a fresh kernel.
file_path_filtered_genes = r"C:\Users\trejan\Desktop\filtered_genes.csv"  # Filtered genes dataset
file_path_nutrition = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\model\train.csv"  # Nutrition dataset


def explore_data(df, name):
    """Print a quick overview of a DataFrame.

    Prints, in order: the ``DataFrame.info()`` report, the number of missing
    values per column, and the first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.
    name : str
        Label used in the section headings that are printed.

    Returns
    -------
    None
    """
    print(f'\n{name} Dataset Info:')
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would add a stray "None" line after the report.
    df.info()
    print(f'\n{name} Missing Values:')
    print(df.isnull().sum())
    print(f'\n{name} Sample Data:')
    print(df.head())


# Load both datasets. Only the reads are guarded, so an unrelated bug in the
# exploration code is not reported as a "loading" problem.
try:
    filtered_genes_df = pd.read_csv(file_path_filtered_genes)
    nutrition_df = pd.read_csv(file_path_nutrition)
except (OSError, ValueError) as e:
    # OSError covers missing/unreadable files; pandas' EmptyDataError and
    # ParserError, as well as UnicodeDecodeError, are ValueError subclasses.
    print("Error loading datasets:", e)
else:
    # Explore both datasets (runs only when both reads succeeded)
    explore_data(filtered_genes_df, 'Filtered Genes')
    explore_data(nutrition_df, 'Nutrition')



Filtered Genes Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1935 entries, 0 to 1934
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   #tax_id                                1935 non-null   int64 
 1   GeneID                                 1935 non-null   int64 
 2   Symbol                                 1935 non-null   object
 3   LocusTag                               1935 non-null   object
 4   Synonyms                               1935 non-null   object
 5   dbXrefs                                1935 non-null   object
 6   chromosome                             1935 non-null   object
 7   map_location                           1935 non-null   object
 8   description                            1935 non-null   object
 9   type_of_gene                           1935 non-null   object
 10  Symbol_from_nomenclature_authority     1935 non-null  

In [2]:
import pandas as pd

# Correct file path
file_path = r"C:\Users\trejan\Desktop\Sem 2\Machine Learning\GenoneDataset\genes\gene_info\gene_info"

# Define output file
output_file = r"C:\Users\trejan\Desktop\filtered_genes.csv"

# Define the genes to filter
genes_of_interest = {'FTO', 'MC4R', 'LEPR', 'PPARG', 'ADIPOQ'}  # Use a set for faster lookups

# Define chunk size (adjust based on available memory)
chunk_size = 100000  # Adjust this value as needed


def filter_genes_by_symbol(input_path, output_path, symbols, chunk_size=100000):
    """Stream a tab-separated gene table and keep rows whose Symbol is wanted.

    The input is read in chunks of ``chunk_size`` rows so the whole file never
    has to fit in memory. Matching rows are written to ``output_path`` as CSV
    with a single header line.

    Parameters
    ----------
    input_path : str
        Path of the tab-separated input file; it must have a ``Symbol`` column.
    output_path : str
        Path of the CSV file to (over)write.
    symbols : collection of str
        Gene symbols to keep.
    chunk_size : int, default 100000
        Number of rows read per chunk.

    Returns
    -------
    int
        Number of data rows written.

    Raises
    ------
    OSError
        If the input cannot be opened or the output cannot be written.
    ValueError
        If the input has no ``Symbol`` column (pandas parse errors are
        ValueError subclasses as well).
    """
    # Open the input first: if it is missing we fail before the output file
    # is truncated, so an earlier good result is not wiped out.
    reader = pd.read_csv(input_path, sep="\t", chunksize=chunk_size, engine="python")
    rows_written = 0
    try:
        with open(output_path, "w", encoding="utf-8", newline="") as out_csv:
            write_header = True  # header goes out with the first chunk only
            for chunk in reader:
                if "Symbol" not in chunk.columns:
                    # Raise instead of `break`, so the caller cannot go on to
                    # report success for a file that was never filtered.
                    raise ValueError(
                        "Column 'Symbol' not found! Available columns: "
                        f"{list(chunk.columns)}"
                    )

                # NOTE(review): matching is on Symbol alone, so rows from every
                # organism that uses these symbols are kept -- confirm whether
                # an additional '#tax_id' filter is wanted.
                matches = chunk[chunk["Symbol"].isin(symbols)]

                # The handle is already open, so each call simply continues
                # writing at the current position.
                matches.to_csv(out_csv, header=write_header, index=False)
                write_header = False
                rows_written += len(matches)
    finally:
        reader.close()
    return rows_written


# Process data in chunks
try:
    filter_genes_by_symbol(file_path, output_file, genes_of_interest, chunk_size)
except (OSError, ValueError) as e:
    print("Error processing file:", e)
else:
    # Printed only when every chunk was processed without error
    print("Filtered data saved successfully!")


Filtered data saved successfully!


In [4]:
import pandas as pd
import numpy as np

# Number of synthetic profiles to generate
num_samples = 2000

# Seed for the random generator, so the saved CSV is the same on every run
SEED = 42

# Define possible SNP variations for each gene (simulated)
gene_variants = {
    "MC4R": ["rs17782313_TT", "rs17782313_CT", "rs17782313_CC"],
    "PPARG": ["rs1801282_CC", "rs1801282_CG", "rs1801282_GG"],
    "FTO": ["rs9939609_TT", "rs9939609_AT", "rs9939609_AA"],
    "LEPR": ["rs1137101_GG", "rs1137101_AG", "rs1137101_AA"],
}


def generate_synthetic_profiles(num_samples, seed=None):
    """Build a DataFrame of randomly generated genetic/lifestyle profiles.

    Every column is drawn independently and uniformly from its allowed values
    or range. In particular ``Obesity_Risk_Score`` is random noise that is
    unrelated to the variant, BMI or lifestyle columns.

    Parameters
    ----------
    num_samples : int
        Number of profiles (rows) to generate.
    seed : int or None, default None
        Seed passed to ``numpy.random.default_rng``; the same seed always
        yields the same frame. ``None`` gives a fresh, unpredictable draw.

    Returns
    -------
    pandas.DataFrame
        One row per profile with the columns Profile_ID, Age, BMI, one
        ``<gene>_Variant`` column per entry of ``gene_variants``,
        Physical_Activity, Diet_Type and Obesity_Risk_Score.
    """
    rng = np.random.default_rng(seed)

    columns = {
        "Profile_ID": range(1, num_samples + 1),
        # upper bound is exclusive: ages fall in 18..64
        "Age": rng.integers(18, 65, size=num_samples),
        "BMI": np.round(rng.uniform(18.5, 40.0, num_samples), 1),
    }
    # One "<gene>_Variant" column per gene, in the order gene_variants lists them
    for gene, variants in gene_variants.items():
        columns[f"{gene}_Variant"] = rng.choice(variants, size=num_samples)
    columns["Physical_Activity"] = rng.choice(["Low", "Moderate", "High"], size=num_samples)
    columns["Diet_Type"] = rng.choice(["Balanced", "High-Fat", "High-Carb"], size=num_samples)
    columns["Obesity_Risk_Score"] = np.round(rng.uniform(0.1, 1.0, num_samples), 2)

    return pd.DataFrame(columns)


# Create DataFrame
synthetic_profiles_df = generate_synthetic_profiles(num_samples, seed=SEED)

# Save to CSV
synthetic_profiles_df.to_csv("genetic_profiles.csv", index=False)

print("Genetic profiles saved successfully!")
print(synthetic_profiles_df.head())  # Display the first few rows


Genetic profiles saved successfully!
   Profile_ID  Age   BMI   MC4R_Variant PPARG_Variant   FTO_Variant  \
0           1   51  24.5  rs17782313_TT  rs1801282_CC  rs9939609_TT   
1           2   54  39.1  rs17782313_TT  rs1801282_CG  rs9939609_AA   
2           3   43  29.4  rs17782313_CT  rs1801282_GG  rs9939609_AT   
3           4   43  33.7  rs17782313_CT  rs1801282_CG  rs9939609_TT   
4           5   26  21.3  rs17782313_TT  rs1801282_CG  rs9939609_AT   

   LEPR_Variant Physical_Activity  Diet_Type  Obesity_Risk_Score  
0  rs1137101_AG              High   High-Fat                0.42  
1  rs1137101_AG              High  High-Carb                0.40  
2  rs1137101_AA              High   Balanced                0.93  
3  rs1137101_AA              High   High-Fat                0.80  
4  rs1137101_AA              High   High-Fat                0.28  
