In [1]:
import pandas as pd
import os

# Set directories
# NOTE(review): base_dir is an absolute, machine-specific path, while the later cells
# read './final_dataset/...' relative to the working directory. The two only refer to
# the same folder when the notebook is run from base_dir -- confirm before sharing.
base_dir = '/Users/shiveshjha/Downloads/TDTR_WITH_ML-MAIN/equation_generation/'
dataset_dir = os.path.join(base_dir, 'datasets_complex')
final_dir = os.path.join(base_dir, 'final_dataset')

# Create final_dataset directory if it doesn't exist
os.makedirs(final_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order. Sort the file names so the row
# order of the merged dataset (and any row numbering derived from it) is the same
# on every run and on every machine.
csv_names = sorted(f for f in os.listdir(dataset_dir) if f.endswith('.csv'))
if not csv_names:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise ValueError(f"No CSV files found in {dataset_dir}")

# Read every CSV file into its own DataFrame
all_csvs = [pd.read_csv(os.path.join(dataset_dir, name)) for name in csv_names]

# Concatenate all dataframes, discarding the per-file indexes
merged_df = pd.concat(all_csvs, ignore_index=True)

# Save merged dataset
merged_df.to_csv(os.path.join(final_dir, 'complete_dataset.csv'), index=False)

print(f"Merged {len(all_csvs)} CSV files")
print(f"Final dataset shape: {merged_df.shape}")

Merged 10 CSV files
Final dataset shape: (5074000, 7)


In [2]:
# Load the merged dataset from disk; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='./final_dataset/complete_dataset.csv')

In [3]:
# Give every row a sequential identifier starting at 1.
n_rows = len(df)
df['Row_ID'] = range(1, n_rows + 1)

# Put Row_ID first and keep the remaining columns in their existing order.
cols = ['Row_ID']
cols.extend(name for name in df.columns if name != 'Row_ID')
df = df[cols]

In [4]:
# Overwrite the merged dataset on disk with the version that has the Row_ID column.
df.to_csv(path_or_buf='./final_dataset/complete_dataset.csv', index=False)

In [5]:
import pandas as pd
import re

# An unsigned decimal number with an optional exponent, e.g. '3', '0.25', '.5', '1e-05'.
_NUMBER = r'(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?'

# 'ax² + bx + c' with an optional leading 'y ='. The sign of a is optional; b and c
# must carry an explicit '+' or '-'. Anything after c is ignored.
_QUADRATIC_RE = re.compile(
    rf'\s*(?:y\s*=\s*)?'
    rf'([+-]?\s*{_NUMBER})\s*x²'
    rf'\s*([+-]\s*{_NUMBER})\s*x'
    rf'\s*([+-]\s*{_NUMBER})'
)


def extract_coefficients(equation):
    """
    Extracts coefficients a, b, and c from a quadratic equation string of the form 'y = ax² + bx + c'

    Parameters:
    -----------
    equation : str
        The equation string to parse. The leading 'y =' is optional and
        whitespace around signs and terms is ignored. Coefficients may use
        exponent notation (e.g. '1e-05').

    Returns:
    --------
    tuple
        The coefficients (a, b, c) as floats, or (None, None, None) when
        `equation` is not a string (e.g. NaN from a missing CSV cell) or does
        not match the expected form.
    """
    # Missing values read from a CSV arrive as float NaN, not str.
    if not isinstance(equation, str):
        return None, None, None

    match = _QUADRATIC_RE.match(equation)
    if match is None:
        return None, None, None

    # Each group may contain whitespace between the sign and the digits
    # (e.g. '- 5'); strip it so float() accepts the text.
    a, b, c = (float(''.join(group.split())) for group in match.groups())
    return a, b, c

# Read the merged dataset; the path is relative to the working directory.
df = pd.read_csv('./final_dataset/complete_dataset.csv')

# Parse every equation into an (a, b, c) tuple, then build the three new columns
# from a single DataFrame. Wrapping each row's result in its own pd.Series (as
# `apply(lambda x: pd.Series(...))` does) is very slow on a multi-million-row frame.
coefficient_columns = ['a_coefficient', 'b_coefficient', 'c_coefficient']
parsed_coefficients = [extract_coefficients(equation) for equation in df['Equations']]

# Reuse df.index so the assignment aligns row-for-row with the source frame.
# Rows whose equation could not be parsed hold (None, None, None) and become
# missing values in the new columns.
df[coefficient_columns] = pd.DataFrame(
    parsed_coefficients, columns=coefficient_columns, index=df.index
)

In [6]:
# Write the DataFrame with the coefficient columns to a separate CSV file,
# leaving the merged dataset file untouched.
df.to_csv(path_or_buf='./final_dataset/complete_dataset_with_coefficients.csv', index=False)