In [1]:
import os
import pandas as pd

### Variable Cleaning for Carvana

In [2]:
# Column names that process_csv_files drops from each CSV it reads.
# One name per line so that additions and removals show up as single-line diffs.
columns_to_remove = [
    'analyticsOnlyGetItByDays',
    'backSeatImageUrl',
    'earliestDaysToDeliveryOrPickupType',
    'earliestLocalLocation_activityType',
    'entireDashImageUrl',
    'imageUrl',
    'infotainmentImageUrl',
    'isNewSinceLastVisit',
    'isOnDemand',
    'isPurchasePending',
    'jellyBeanDesktopUrl',
    'passengerSeatImageUrl',
    'previousPrice',
    'price_discounts',
    'price_evTaxCredit',
    'price_evTaxCreditAppliedPrice',
    'price_feesMinusShipping',
    'price_pricingErrorMessage',
    'price_totalDiscountAmount',
    'price_totalFees',
    'vehicleLockType',
    'jellyBeanMobileUrl',
    'price_totalTaxes',
    'price_transportCost',
    'splattingVideos',
    'steeringImageUrl',
    'stockRecallStatusType',
    'storeKey',
    'stores',
    'textlessUrl',
    'transportCost',
    'trunkImageUrl',
    'vehicleReservableReasons',
    'vehicleTags',
    'priceUpdateDate',
    'vehiclePurchaseType',
]

# Function to process each CSV file
def process_csv_files(input_directory, output_directory, columns=None):
    """Drop unwanted columns from every CSV in a directory and save the results.

    Every file directly inside ``input_directory`` whose name ends with
    ``.csv`` is read with pandas, stripped of the requested columns, and
    written (without the index) to ``output_directory`` as
    ``modified_<original filename>``. One progress line is printed per file.

    Parameters
    ----------
    input_directory : str
        Directory containing the source CSV files.
    output_directory : str
        Directory that receives the modified files. It is created, including
        any missing parents, when it does not exist.
    columns : iterable of str, optional
        Column names to drop. When omitted, the module-level
        ``columns_to_remove`` list is used. Names that are not present in a
        given file are skipped for that file.

    Returns
    -------
    None
    """
    # Resolve the default lazily so callers can pass their own list, and
    # materialise it once because it is iterated again for every file.
    if columns is None:
        columns = columns_to_remove
    columns = list(columns)

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_directory, exist_ok=True)

    # os.listdir returns names in arbitrary order; sorting makes the
    # processing (and logging) order the same on every run.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith('.csv'):
            continue

        # Read the CSV file into a DataFrame
        file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(file_path)

        # Drop only the requested columns that this file actually has
        df = df.drop(columns=[col for col in columns if col in df.columns])

        # Save the modified DataFrame under the new name in the output directory
        new_filename = f"modified_{filename}"
        new_file_path = os.path.join(output_directory, new_filename)
        df.to_csv(new_file_path, index=False)
        print(f"Processed {filename} -> {new_filename}")

# Where the source CSVs live and where the cleaned copies are written
input_directory = '/content/Car_data'
output_directory = '/content/modified_car_details'

# Run the column clean-up over every CSV in the input directory
process_csv_files(
    input_directory=input_directory,
    output_directory=output_directory,
)


Processed MA_vehicles.csv -> modified_MA_vehicles.csv
Processed GA_vehicles.csv -> modified_GA_vehicles.csv
Processed OR_vehicles.csv -> modified_OR_vehicles.csv
Processed TX_vehicles.csv -> modified_TX_vehicles.csv
Processed CA_vehicles.csv -> modified_CA_vehicles.csv
Processed FL_vehicles.csv -> modified_FL_vehicles.csv


### Store unique car details

In [5]:
# Directory where the CSV files are stored
directory = '/content/modified_car_details'

# List all CSV files in the directory.
# os.listdir() returns names in arbitrary order, and the de-duplication below
# keeps the FIRST row seen for each VIN, so the names are sorted to make
# "first" mean the same thing on every run.
csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))

# Empty list to collect dataframes
dfs = []

# Read each CSV file and add the dataframe to the list
for file in csv_files:
    file_path = os.path.join(directory, file)  # Full path to the CSV file
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        dfs.append(df)
    else:
        # Only reachable if the file disappeared after the listing was taken
        print(f"File {file_path} does not exist.")

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when nothing was loaded.
if not dfs:
    raise ValueError(f"No CSV files could be loaded from {directory}")

# Concatenate all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

# Remove duplicates based on the "vin" column, keeping the first occurrence
# (i.e. the row from the alphabetically earliest file)
unique_df = combined_df.drop_duplicates(subset=['vin'], keep='first')

# Print the number of unique records
print(f"Number of refetch unique records found: {len(unique_df)}")

# Save the unique records to a new CSV file
unique_df.to_csv('refetch_unique_car_records.csv', index=False)

print("Unique records have been saved to 'refetch_unique_car_records.csv'")


Number of refetch unique records found: 10862
Unique records have been saved to 'refetch_unique_car_records.csv'


### Merge unique_car_records.csv and refetch_unique_car_records.csv

In [6]:
# Load the CSV files into pandas DataFrames
unique_car_records = pd.read_csv('unique_car_records.csv')
refetch_unique_car_records = pd.read_csv('refetch_unique_car_records.csv')

# Find the common 'vin' values between the two DataFrames.
# Missing VINs are dropped and each side is reduced to distinct VINs first:
# an inner merge otherwise pairs NaN keys with each other and multiplies rows
# for repeated keys, which would overstate the number of shared VINs.
common_vins = pd.merge(
    unique_car_records[['vin']].dropna().drop_duplicates(),
    refetch_unique_car_records[['vin']].dropna().drop_duplicates(),
    on='vin',
    how='inner',
)

# Print the number of common records
print(f"Number of common records based on 'vin': {common_vins.shape[0]}")

# Combine both DataFrames. ignore_index gives the result a fresh 0..n-1 index
# instead of repeating each file's own row labels.
combined_df = pd.concat([unique_car_records, refetch_unique_car_records], ignore_index=True)

# Drop duplicate rows based on the 'vin' column, keeping the first occurrence
# (rows from unique_car_records.csv win over the refetched ones)
final_unique_records = combined_df.drop_duplicates(subset='vin', keep='first')

# Save the final unique records to a new CSV file
final_unique_records.to_csv('final_unique_records.csv', index=False)

# Print the number of records in the final CSV file
print(f"Number of records in the final unique CSV file: {final_unique_records.shape[0]}")

print("Merging complete. The final unique records have been saved to 'final_unique_records.csv'.")


Number of common records based on 'vin': 0
Number of records in the final unique CSV file: 45536
Merging complete. The final unique records have been saved to 'final_unique_records.csv'.


## Compare the records between TrueCars and Carvana

In [None]:
# Load the CSV files into DataFrames
truecars_df = pd.read_csv('TrueCars_data.csv')
unique_cars_df = pd.read_csv('unique_car_records.csv')

# Normalise the VIN columns before comparing: strip surrounding whitespace and
# convert to uppercase, so values that differ only by padding or letter case
# are treated as the same VIN.
truecars_df['VIN'] = truecars_df['VIN'].str.strip().str.upper()
unique_cars_df['vin'] = unique_cars_df['vin'].str.strip().str.upper()

# Rows whose VIN appears in both DataFrames (columns of both are kept)
common_vins = pd.merge(truecars_df, unique_cars_df, left_on='VIN', right_on='vin')

# Find VINs only in unique_car_records.csv
unique_in_unique_cars = unique_cars_df[~unique_cars_df['vin'].isin(truecars_df['VIN'])]

# Find VINs only in TrueCars_data.csv
unique_in_truecars = truecars_df[~truecars_df['VIN'].isin(unique_cars_df['vin'])]

# Count the number of rows in each DataFrame
common_count = common_vins.shape[0]
unique_in_unique_cars_count = unique_in_unique_cars.shape[0]
unique_in_truecars_count = unique_in_truecars.shape[0]

# Output the results
print(f"Common VINs count: {common_count}")
print(f"VINs in unique_car_records but not in TrueCars_data: {unique_in_unique_cars_count}")
print(f"VINs in TrueCars_data but not in unique_car_records: {unique_in_truecars_count}")

# Persist the three result sets for later inspection
common_vins.to_csv('common_vins.csv', index=False)
unique_in_unique_cars.to_csv('unique_in_unique_car_records.csv', index=False)
unique_in_truecars.to_csv('unique_in_truecars.csv', index=False)



Common VINs count: 0
VINs in unique_car_records but not in TrueCars_data: 34674
VINs in TrueCars_data but not in unique_car_records: 45920

Common VINs DataFrame:
Empty DataFrame
Columns: [Car Name, Car Webpage, Car Health, Price, Car Exterior Color, Car Interior Color, Miles, Fuel Type, Fuel Efficiency (mileage), EPA Range, Transmission, Drivetrain, Engine, Location, Listing Status, VIN, Stock Number, Accidents, Owners, Car Title Status, Use Type, Vehicle Last Inspected, Make, Model, Trim, Body Class, Model Year, Seating Capacity, bodyStyle, color, drivingMiles, earliestLocalLocation_locationId, fuelType, interiorColor, kbbTrim, locationId, make, mileage, milesPerGallon, model, parentModel, price_kbbValue, price_total, seatingCapacity, stockNumber, trim, vdpSlug, vehicleId, vehicleInventoryType, vin, year, zip_code]
Index: []

[0 rows x 52 columns]

Unique VINs in unique_car_records.csv:
         bodyStyle   color  drivingMiles  earliestLocalLocation_locationId  \
0      Convertible  

## Creating Sample Data

In [7]:
# Function to create a sample CSV of randomly chosen records
def create_sample_csv(input_file, output_file, num_samples=50, random_state=42):
    """Write a random sample of rows from one CSV file to another.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to sample from.
    output_file : str
        Path of the CSV file to write; saved without the index.
    num_samples : int, default 50
        Number of rows to draw. If the input has fewer rows, a warning is
        printed and every row is written instead.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample``. The default keeps the sample
        reproducible between runs; pass ``None`` for a different sample on
        each call.

    Returns
    -------
    None
    """
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(input_file)

    # Sampling without replacement cannot return more rows than exist,
    # so cap the request at the number of available records.
    if len(df) < num_samples:
        print(f"Warning: There are only {len(df)} records in the file. The sample will contain all of them.")
        num_samples = len(df)

    # Take a random sample of the data
    sample_df = df.sample(n=num_samples, random_state=random_state)

    # Write the sample to a new CSV file
    sample_df.to_csv(output_file, index=False)
    print(f"A random sample of {num_samples} records has been saved to {output_file}")

# Source file to sample from and destination for the sample
input_csv_file = 'final_unique_records.csv'
output_csv_file = 'sample_final_unique_carvana_data.csv'

# Build the sample using the function's default sample size
create_sample_csv(input_file=input_csv_file, output_file=output_csv_file)


A random sample of 50 records has been saved to sample_final_unique_carvana_data.csv


## Tried scraping from VIN-focused endpoint

In [None]:
import requests

In [None]:
# Fetch a single vehicle page. requests has no default timeout, so one is set
# explicitly to keep an unresponsive server from blocking the kernel forever.
response = requests.get("https://www.carvana.com/vehicle/3494097?refSource=srp", timeout=30)

In [None]:
# Show only the beginning of the body: enough to see what kind of page came
# back without dumping the entire HTML document into the notebook output.
response.text[:500]

'<!DOCTYPE html><html lang="en-US"><head><title>Just a moment...</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><meta name="robots" content="noindex,nofollow"><meta name="viewport" content="width=device-width,initial-scale=1"><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji}body{display:flex;flex-direction:column;height:100vh;min-height:100vh}.main-content{margin:8rem auto;max-width:60rem;padding-left:1.5rem}@media (width <= 720px){.main-content{margin-top:4rem}}.h2{font-size:1.5rem;font-weight:500;line-height:2.25rem}@media (width <= 720px){.h2{font-size:1.25rem;line-height:1.5rem}}#challenge-error-text{background-image:url(data:image/svg+xml;base64,PHN2ZyB4bWxucz0