<a href="https://colab.research.google.com/github/winterForestStump/application_project/blob/main/script_for_extraction_ver2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script for pictures extraction and manipulation

### 1. Counting number of zip folders inside the folders for each gate:

Since each zip folder represents a vehicle which went through the gate, we can estimate the number of pictures

In [None]:
import os

def count_zip_folders(path):
    """Return the number of ``.zip`` entries found one level below ``path``.

    Only the immediate sub-folders of ``path`` are inspected; files lying
    directly in ``path`` are skipped and the search does not go any deeper.
    An entry is counted when its name ends with the lower-case suffix '.zip'.
    """
    total = 0
    for entry in os.listdir(path):
        folder = os.path.join(path, entry)
        if os.path.isdir(folder):
            # Add up every '.zip' name inside this sub-folder in one go
            total += sum(1 for name in os.listdir(folder) if name.endswith('.zip'))
    return total

In [None]:
# Gate ICG01: count the zip folders, then report the total
path = 'D:\\ICG01'
zip_folder_count = count_zip_folders(path)
report = f"Number of zip folders in '{path}': {zip_folder_count}"
print(report)

Number of zip folders in 'D:\ICG01': 174853


In [None]:
# Gate ICG02: count the zip folders, then report the total
path = 'D:\\ICG02'
zip_folder_count = count_zip_folders(path)
report = f"Number of zip folders in '{path}': {zip_folder_count}"
print(report)

Number of zip folders in 'D:\ICG02': 115552


In [None]:
# Gate OCG01: count the zip folders, then report the total
path = 'D:\\OCG01'
zip_folder_count = count_zip_folders(path)
report = f"Number of zip folders in '{path}': {zip_folder_count}"
print(report)

Number of zip folders in 'D:\OCG01': 162422


In [None]:
# Gate OCG02: count the zip folders, then report the total
path = 'D:\\OCG02'
zip_folder_count = count_zip_folders(path)
report = f"Number of zip folders in '{path}': {zip_folder_count}"
print(report)

Number of zip folders in 'D:\OCG02': 121221


In total we have 574 048 zip folders. Each one represents a vehicle

Assuming that each zip folder contains about 40 photos, the total amount of data obtained is about 2.3 mln pictures

### 2. Pictures extraction:

Inside each zip folder there are photos of the vehicles from different angles and also some cropped pictures for number plate recognition. For future labeling we decided to use only the photos taken from the back of the trucks.

In [None]:
import os
import shutil
import zipfile

def extract_pictures_from_zips(path, picture_names):
    """Copy selected pictures out of the zip archives stored under ``path``.

    Every immediate sub-folder of ``path`` is scanned for files ending in
    '.zip'. Each archive member whose name is contained in ``picture_names``
    is written to ``<path>/extracted`` as ``<archive name without
    extension>_<member name>``. Archives that raise ``zipfile.BadZipFile``
    are skipped and listed in a warning printed once the scan is finished.
    """
    # Output folder; created on first use and reused on later runs
    target_dir = os.path.join(path, 'extracted')
    os.makedirs(target_dir, exist_ok=True)
    unreadable = []
    for day_name in os.listdir(path):
        day_dir = os.path.join(path, day_name)
        if os.path.isdir(day_dir):
            for archive_name in os.listdir(day_dir):
                if archive_name.endswith('.zip'):
                    # The archive name (minus extension) keeps extracted files unique
                    prefix = os.path.splitext(archive_name)[0] + '_'
                    archive_path = os.path.join(day_dir, archive_name)
                    try:
                        with zipfile.ZipFile(archive_path, 'r') as archive:
                            for member in archive.namelist():
                                if member not in picture_names:
                                    continue
                                out_path = os.path.join(target_dir, prefix + member)
                                with archive.open(member) as src, open(out_path, 'wb') as dst:
                                    shutil.copyfileobj(src, dst)
                    except zipfile.BadZipFile:
                        # Remember the archive relative to ``path`` for the final report
                        unreadable.append(os.path.join(day_name, archive_name))
    # Report the archives that could not be read, if there were any
    if unreadable:
        print("Warning: The following zip files were not able to be unzipped:")
        for entry in unreadable:
            print(entry)

The photos we need are named "snapshot-chassis0-lsback-full.jpg"

In [None]:
# Extract the full rear-view snapshot from every ICG02 archive
picture_names = ['snapshot-chassis0-lsback-full.jpg']
path = 'D:\\ICG02'
extract_pictures_from_zips(path=path, picture_names=picture_names)

20220212\LHG--ICG02-20220212-190956-736868.zip
20220212\LHG--ICG02-20220212-190637-736867.zip
20220802\LHG--ICG02-20220802-140806-793736.zip


In [None]:
# Extract the full rear-view snapshot from every ICG01 archive
picture_names = ['snapshot-chassis0-lsback-full.jpg']
path = 'D:\\ICG01'
extract_pictures_from_zips(path=path, picture_names=picture_names)

20220214\LHG--ICG01-20220214-202945-958525.zip
20220908\LHG--ICG01-20220908-113025-1059166.zip


In [None]:
# Extract the full rear-view snapshot from every OCG01 archive
picture_names = ['snapshot-chassis0-lsback-full.jpg']
path = 'D:\\OCG01'
extract_pictures_from_zips(path=path, picture_names=picture_names)

20220519\LHG--OCG01-20220519-111715-877941.zip
20220907\LHG--OCG01-20220907-064128-925695.zip


Total running time of the code - 1271 min (about 21 hours):

* 'ICG01' - 445 min,
* 'ICG02' - 145 min,
* 'OCG01' - 383 min,
* 'OCG02' - 298 min

Some of the zip folders (7 in total) had issues and could not be opened

### 3. Clean the data: finding zip folder names with only photos of truck inside

The number of pictures is about 500K, which is a lot, so we need to reduce it somehow. One way to filter them is to extract only the pictures of trucks. The zip folders with truck pictures contain photos named 'snapshot-chassis0-lsback-ocr-dcocr1.jpg' (for ICG folders) and 'snapshot-chassis0-lsback-ocr-cocr0.jpg' (for OCG folders)

In [None]:
import os
import zipfile

def get_zip_folder_names_with_picture(path, picture_name='snapshot-chassis0-lsback-ocr-dcocr1.jpg'):
    """Return the names of the zip archives under ``path`` that contain ``picture_name``.

    Every immediate sub-folder of ``path`` (the 'YYYYMMDD' folders) is scanned
    for files ending in '.zip'. An archive is selected when one of its members
    is named exactly ``picture_name``.

    Args:
        path: Folder whose immediate sub-folders hold the zip archives.
        picture_name: Archive member to look for. The default is the picture
            name used for the ICG gates.

    Returns:
        A list with the file name, minus its extension, of every matching
        archive, in directory-listing order.

    Archives that raise ``zipfile.BadZipFile`` are skipped; their paths
    relative to ``path`` are printed in a warning after the scan.
    """
    # Names of the archives that contain the requested picture
    zip_folder_names = []
    # Archives that could not be opened; must exist before the scan so it can be filled
    bad_zip_files = []
    # Iterate through each 'YYYYMMDD' folder in the path
    for yyyymmdd_folder_name in os.listdir(path):
        yyyymmdd_folder_path = os.path.join(path, yyyymmdd_folder_name)
        # Ignore any files that are not folders
        if not os.path.isdir(yyyymmdd_folder_path):
            continue
        for zip_file_name in os.listdir(yyyymmdd_folder_path):
            # Ignore any files that are not zip files
            if not zip_file_name.endswith('.zip'):
                continue
            try:
                with zipfile.ZipFile(os.path.join(yyyymmdd_folder_path, zip_file_name), 'r') as zip_file:
                    contains_picture = picture_name in zip_file.namelist()
            except zipfile.BadZipFile:
                bad_zip_files.append(os.path.join(yyyymmdd_folder_name, zip_file_name))
                continue
            if contains_picture:
                # splitext keeps dots inside the archive name, matching the
                # prefix that extract_pictures_from_zips gives to extracted files
                zip_folder_names.append(os.path.splitext(zip_file_name)[0])
    # Display bad zip files, if any
    if bad_zip_files:
        print("Warning: The following zip files were not able to be unzipped:")
        for bad_zip_file in bad_zip_files:
            print(bad_zip_file)
    return zip_folder_names

For the "ICG02" folder:

In [None]:
# Gate ICG02: collect the archives holding the truck-only picture and show how many there are
path = 'D:\\ICG02'
zip_folder_names = get_zip_folder_names_with_picture(path=path)
matching_count = len(zip_folder_names)
print(matching_count)

72961


Save the names of the pictures in the txt file

In [None]:
# Write the ICG02 archive names to a text file, one name per line
my_list_ICG02 = zip_folder_names
file_name = "ICG02_list.txt"
with open(file_name, mode='w') as file:
    file.writelines(entry + "\n" for entry in my_list_ICG02)


For the "OCG01" folder

Inside the zip folders for the "OCG01" gate there are no pictures with the name 'snapshot-chassis0-lsback-ocr-dcocr1.jpg'. Instead there is a pattern: if the picture is a photo of a truck, then there is also a picture with the name 'snapshot-chassis0-lsback-ocr-cocr0.jpg'. We can use that

In [None]:
import os
import zipfile

def get_zip_folder_names_with_picture_ver2(path, picture_name='snapshot-chassis0-lsback-ocr-cocr0.jpg'):
    """Return the names of the zip archives under ``path`` that contain ``picture_name``.

    Every immediate sub-folder of ``path`` (the 'YYYYMMDD' folders) is scanned
    for files ending in '.zip'. An archive is selected when one of its members
    is named exactly ``picture_name``.

    Args:
        path: Folder whose immediate sub-folders hold the zip archives.
        picture_name: Archive member to look for. The default is the picture
            name used for the OCG gates.

    Returns:
        A list with the file name, minus its extension, of every matching
        archive, in directory-listing order.

    Archives that raise ``zipfile.BadZipFile`` are skipped; their paths
    relative to ``path`` are printed in a warning after the scan.
    """
    # Names of the archives that contain the requested picture
    zip_folder_names = []
    # Archives that could not be opened; must exist before the scan so it can be filled
    bad_zip_files = []
    # Iterate through each 'YYYYMMDD' folder in the path
    for yyyymmdd_folder_name in os.listdir(path):
        yyyymmdd_folder_path = os.path.join(path, yyyymmdd_folder_name)
        # Ignore any files that are not folders
        if not os.path.isdir(yyyymmdd_folder_path):
            continue
        for zip_file_name in os.listdir(yyyymmdd_folder_path):
            # Ignore any files that are not zip files
            if not zip_file_name.endswith('.zip'):
                continue
            try:
                with zipfile.ZipFile(os.path.join(yyyymmdd_folder_path, zip_file_name), 'r') as zip_file:
                    contains_picture = picture_name in zip_file.namelist()
            except zipfile.BadZipFile:
                bad_zip_files.append(os.path.join(yyyymmdd_folder_name, zip_file_name))
                continue
            if contains_picture:
                # splitext keeps dots inside the archive name, matching the
                # prefix that extract_pictures_from_zips gives to extracted files
                zip_folder_names.append(os.path.splitext(zip_file_name)[0])
    # Display bad zip files, if any
    if bad_zip_files:
        print("Warning: The following zip files were not able to be unzipped:")
        for bad_zip_file in bad_zip_files:
            print(bad_zip_file)
    return zip_folder_names

In [None]:
# Gate OCG01: collect the archives holding the truck-only picture and show how many there are
path = 'D:\\OCG01'
zip_folder_names = get_zip_folder_names_with_picture_ver2(path=path)
matching_count = len(zip_folder_names)
print(matching_count)

96877


Save the names of the pictures in the txt file

In [None]:
# Write the OCG01 archive names to a text file, one name per line
my_list_OCG01 = zip_folder_names
file_name = "OCG01_list.txt"
with open(file_name, mode='w') as file:
    file.writelines(entry + "\n" for entry in my_list_OCG01)

For the "OCG02" folder

In [None]:
# Gate OCG02: collect the archives holding the truck-only picture and show how many there are
path = 'D:\\OCG02'
zip_folder_names = get_zip_folder_names_with_picture_ver2(path=path)
matching_count = len(zip_folder_names)
print(matching_count)

67628


Save the names of the pictures in the txt file

In [None]:
# Write the OCG02 archive names to a text file, one name per line
my_list_OCG02 = zip_folder_names
file_name = "OCG02_list.txt"
with open(file_name, mode='w') as file:
    file.writelines(entry + "\n" for entry in my_list_OCG02)

For the "ICG01" folder:

In [None]:
# Gate ICG01: collect the archives holding the truck-only picture and show how many there are
path = 'D:\\ICG01'
zip_folder_names = get_zip_folder_names_with_picture(path=path)
matching_count = len(zip_folder_names)
print(matching_count)

101444


Save the names of the pictures in the txt file

In [None]:
# Write the ICG01 archive names to a text file, one name per line
my_list_ICG01 = zip_folder_names
file_name = "ICG01_list.txt"
with open(file_name, mode='w') as file:
    file.writelines(entry + "\n" for entry in my_list_ICG01)

In [None]:
# Per-gate totals printed by the cells above: ICG02, OCG01, OCG02, ICG01
truck_counts = (72961, 96877, 67628, 101444)
print(f' Total amount of photos are {sum(truck_counts)}')

 Total amount of photos are 338910


The overall number of truck images is 338 910. We will work with them

### 4. Concatenate all extracted pictures inside one folder:

In [None]:
# VERY SLOW
import os
import shutil

# Per-gate folders holding the pictures extracted earlier
source_paths = [
    'D:\\ICG01\\extracted',
    'D:\\ICG02\\extracted',
    'D:\\OCG01\\extracted',
    'D:\\OCG02\\extracted']
# Single folder that receives the pictures of all gates
destination_path = 'D:\\Extracted_ALL'
os.makedirs(destination_path, exist_ok=True)
# Walk every source tree and move each '.jpg' file into the destination folder
for source_path in source_paths:
    for root, dirs, files in os.walk(source_path):
        jpg_names = [name for name in files if name.endswith('.jpg')]
        for jpg_name in jpg_names:
            shutil.move(os.path.join(root, jpg_name), destination_path)

### 5. Matching extracted pictures names with TXT file

For labeling we need only pictures with trucks. So we need to match extracted pictures with the TXT names to take them to label

In [None]:
import os
import shutil

# Define the path to the folder containing the pictures
pictures_folder = "D:\\OCG01\\extracted"
# Define the path to the text file containing the names to match
text_file = "OCG01_list.txt"
# Define the name of the new folder to copy the matching pictures to
new_folder = "D:\\OCG01\\extracted\\TRUCKS"
# Create the new folder if it doesn't exist
os.makedirs(new_folder, exist_ok=True)
# Read one batch of names from the text file; the slice bounds select the batch
# NOTE(review): the upper bound 72962 is kept as found - confirm it fits this list
with open(text_file, "r") as f:
    lines = f.readlines()[50000:72962] #extracting by batches
# Strip the line endings and drop blank lines: an empty name is a substring
# of every file name and would copy the whole folder
names_to_match = [line.strip() for line in lines]
names_to_match = [name for name in names_to_match if name]
# List the pictures once instead of once per name; the folder content does not
# change while copying because the copies go into the TRUCKS sub-folder.
# Sub-folders (such as TRUCKS itself) are left out since only files are copied.
candidate_files = [
    filename for filename in os.listdir(pictures_folder)
    if os.path.isfile(os.path.join(pictures_folder, filename))
]
# Copy every picture whose file name contains one of the names
for name_to_match in names_to_match:
    for filename in candidate_files:
        if name_to_match in filename:
            shutil.copy2(os.path.join(pictures_folder, filename), os.path.join(new_folder, filename))

### 6.  Find pictures from excel file

We received an Excel file with some more information. We tried to get info about the trucks and signs from the Excel file, but the results were not useful: we failed to match the information from the received photos with the data from the file. Also, the data in the file is not accurate

In [None]:
import os
import pandas as pd
Excel_dates_and_clases = pd.read_csv('Excel_dates_and_clases.csv')
# define the directory where the JPG files are located
# (every backslash is escaped; a bare '\T' is an invalid escape sequence)
jpg_dir = 'path_to_directory\\TRUCKS_TO_LABEL_p1\\1'
# list all the files in the directory
files = os.listdir(jpg_dir)
# Read the 'amgate_d' column once: drop missing values and duplicates and
# compare as text, so a NaN or numeric cell cannot break the substring test
# NOTE(review): assumes the text form of a cell is what appears in the file names - confirm
gate_dates = [str(val) for val in Excel_dates_and_clases['amgate_d'].dropna().unique()]
gate_prefix = 'LHG--ICG02-'
# filter the JPG files based on the values in the 'amgate_d' column
jpg_files = []
for f in files:
    if f.endswith('.jpg') and gate_prefix in f:
        # the 13 characters following the gate prefix are the value we search in
        value = f.split(gate_prefix)[1][:13]
        if any(val in value for val in gate_dates):
            jpg_files.append(f)
# save the matched JPG file names to a text file
with open('matched_jpg_files.txt', 'w') as file:
    for f in jpg_files:
        file.write(f + '\n')

### 7. Copy pictures which have the same name as TXT files with labels

For the first model training try we need to extract pictures with the same names as from the TXT file

In [None]:
import os
import shutil

# Folder with the extracted pictures, folder receiving the copies,
# and folder tree holding the label files
src_folder = "E:/ICG02/extracted"
dst_folder = "E:/1stTRY"
labels_folder = "E:/labels"
ignore_folder = "TRUCKS"
# Iterate over every file below the labels folder
for root, _, files in os.walk(labels_folder):
    for file_name in files:
        # The picture shares the label file's base name and has a '.jpg' extension
        picture_file = os.path.join(src_folder, os.path.splitext(file_name)[0] + ".jpg")
        # Check if picture file exists and its path does not contain "TRUCKS"
        if os.path.isfile(picture_file) and ignore_folder not in picture_file:
            # Make sure the destination exists; shutil.copy does not create it
            os.makedirs(dst_folder, exist_ok=True)
            dst_file = os.path.join(dst_folder, os.path.basename(picture_file))
            # Copy file to destination folder
            shutil.copy(picture_file, dst_file)