In [11]:
import os
import pandas as pd
import shutil

def extract_filenames_from_folder(folder_path):
    """
    Lists the names of the regular files directly inside a folder.

    Subdirectories are skipped and the listing is not recursive. The order
    of the result is whatever os.listdir returns.

    Parameters:
    folder_path (str): Path to the folder.

    Returns:
    list: Names (not full paths) of the files found in the folder.
    """
    found = []
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        # os.listdir yields bare names, so the check needs the joined path
        if os.path.isfile(full_path):
            found.append(entry)
    return found

def extract_filenames_from_csv(csv_path, column_name):
    """
    Extracts filenames from the specified column in a CSV file.

    Blank cells are skipped, values are converted to str, and surrounding
    whitespace is removed, so the result contains only usable names.

    Parameters:
    csv_path (str): Path to the CSV file.
    column_name (str): Name of the column containing filenames.

    Returns:
    list: List of non-empty filename strings from the CSV column, in file order.

    Raises:
    KeyError: If column_name is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)
    # Blank cells are read as NaN (a float). Dropping them keeps the result a
    # list of real names that can be compared with names read from disk.
    names = df[column_name].dropna().astype(str).str.strip()
    # A whitespace-only cell becomes '' after stripping; it is not a filename.
    return [name for name in names if name]

def compare_filenames(folder_filenames, csv_filenames):
    """
    Compares filenames from the folder and CSV, identifying those not in the CSV.

    The comparison is an exact, case-sensitive match. Order and duplicates of
    folder_filenames are preserved in the result.

    Parameters:
    folder_filenames (list): List of filenames from the folder.
    csv_filenames (list): List (or any iterable) of filenames from the CSV.
        Its elements must be hashable.

    Returns:
    list: List of filenames present in the folder but not in the CSV.
    """
    # Build the lookup once: membership tests on a set are constant time,
    # and a one-shot iterator is consumed exactly once instead of per file.
    known = set(csv_filenames)
    return [f for f in folder_filenames if f not in known]

def copy_files_to_new_directory(file_list, source_folder, destination_folder):
    """
    Copies specified files from the source folder to the destination folder.

    The destination folder (including missing parent folders) is created if
    needed, even when file_list is empty. A file that already exists in the
    destination under the same name is overwritten.

    Parameters:
    file_list (list): List of filenames to copy.
    source_folder (str): Path to the source folder.
    destination_folder (str): Path to the destination folder.

    Raises:
    OSError: If the destination cannot be created or a file cannot be copied
        (for example, a listed file is missing from the source folder).
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(destination_folder, exist_ok=True)
    for file_name in file_list:
        shutil.copy(os.path.join(source_folder, file_name), os.path.join(destination_folder, file_name))

def main(folder_path, csv_path, csv_column_name, new_directory):
    """
    Copies every file in a folder that the CSV does not list into a new directory.

    Reads the folder's filenames and the CSV's filename column, works out which
    folder files are absent from the CSV, copies those files, and prints how
    many were selected for copying.

    Parameters:
    folder_path (str): Path to the folder containing files.
    csv_path (str): Path to the CSV file.
    csv_column_name (str): Name of the column in the CSV containing filenames.
    new_directory (str): Path to the new directory where unmatched files will be copied.
    """
    # Gather both sides of the comparison first
    names_on_disk = extract_filenames_from_folder(folder_path)
    names_in_csv = extract_filenames_from_csv(csv_path, csv_column_name)

    # Files on disk that the CSV never mentions
    files_to_copy = compare_filenames(names_on_disk, names_in_csv)

    copy_files_to_new_directory(files_to_copy, folder_path, new_directory)
    print(f"Copied {len(files_to_copy)} files to {new_directory}")

# Example run: copy every image that the product CSV does not list into
# 'missing_files' (a path relative to the current working directory).
folder_path = '/Users/njlalwani/Documents/GitHub/Ecomm-data/photo_scripts/shopify_images'
csv_path = '/Users/njlalwani/Documents/GitHub/Ecomm-data/Descriptions_scripts/product_info_final.csv'
csv_column_name = 'Filename'  # Column of the CSV that holds the filenames
new_directory = 'missing_files'

main(
    folder_path=folder_path,
    csv_path=csv_path,
    csv_column_name=csv_column_name,
    new_directory=new_directory,
)


Copied 482 files to missing_files


In [17]:
import os
import re

# Folder whose image filenames are analysed
directory_path = '/Users/njlalwani/Documents/GitHub/Ecomm-data/photo_scripts/shopify_images'

# Filenames are expected to look like <code>_<color>_<number>.<extension>,
# e.g. "B8808_White_1.png"; any other entry in the folder is ignored.
# NOTE(review): the extension part only matches lowercase letters, so names
# ending in ".PNG" or ".JPG" are skipped — confirm that is intended.
pattern = re.compile(r'^(?P<code>[^_]+)_(?P<color>[^_]+)_(?P<num>\d+)\.[a-z]+$')

# Sets de-duplicate for us: one entry per code, one per (code, color) pair
unique_codes = set()
unique_combinations = set()

for filename in os.listdir(directory_path):
    match = pattern.match(filename)
    if match:
        code = match.group('code')
        unique_codes.add(code)
        # Counting files numbered 1 was only a proxy for combinations: it
        # double-counts a pair that has two "_1" files (e.g. .png and .jpg)
        # and misses a pair that has no "_1" image at all.
        unique_combinations.add((code, match.group('color')))

# Kept under its original name so later cells that read it still work
num_count = len(unique_combinations)

# Output the results
print(f"Number of unique code color combinations : {num_count}")
print(f"Number of unique codes: {len(unique_codes)}")


Number of unique code color combinations : 355
Number of unique codes: 322


In [22]:
import pandas as pd

# Location of the product CSV to inspect
csv_file_path = '/Users/njlalwani/Downloads/product_info_final.csv'

# Read the whole file into a DataFrame
df = pd.read_csv(csv_file_path)

# Distinct values in the 'Code' column, with missing values left out
unique_codes_count = len(df['Code'].dropna().unique())

# Report the count
print(f"Number of unique items in the 'Code' column: {unique_codes_count}")


Number of unique items in the 'Code' column: 321


In [8]:
import pandas as pd
import shutil
import os

# Define paths
csv_file_path = '/Users/njlalwani/Desktop/Issues.csv'  # Path to the CSV file
source_directory = '/Users/njlalwani/Documents/GitHub/Ecomm-data/photo_scripts/shopify_images'  # Path to the source directory
destination_directory = 'issues_description'  # Path to the destination directory

# Create the destination directory if it doesn't exist
# (exist_ok avoids the check-then-create race of testing os.path.exists first)
os.makedirs(destination_directory, exist_ok=True)

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Initialize a counter for the number of files copied
files_copied_count = 0

# Assuming the column with filenames is named 'Filename'.
# Blank cells are read as NaN and would make os.path.join raise, so skip them.
for filename in df['Filename'].dropna():
    # Non-text cells (e.g. a purely numeric name) still need to be a str for os.path.join
    filename = str(filename)
    src_file = os.path.join(source_directory, filename)
    dest_file = os.path.join(destination_directory, filename)

    # Copy only regular files; a directory with a matching name would make shutil.copy fail
    if os.path.isfile(src_file):
        shutil.copy(src_file, dest_file)
        files_copied_count += 1
        print(f"Copied: {filename}")
    else:
        print(f"File not found: {filename}")

# Output the result
print(f"Number of files copied: {files_copied_count}")


Copied: B8808_White_1.png
Copied: E2088_Pink_1.png
Copied: E2360_Purple-Print_1.jpg
Number of files copied: 3
