In [None]:
!pip install --upgrade pip
!pip install praw
!pip install opencv-python-headless

In [None]:
import os
import requests
import pandas as pd
import numpy as np
import cv2
from sklearn.cluster import KMeans
from webcolors import hex_to_rgb, CSS3_HEX_TO_NAMES
import shutil
import os

In [None]:
# Make the project folder the current working directory.
# NOTE(review): presumably this is what makes the local `memes_scrapper` and
# `preprocessing` modules importable in the next cell — confirm they live here.
new_directory = "/data/notebook_files/Reddit_Project"
os.chdir(new_directory)

In [None]:
from memes_scrapper import download_subreddit_images
from preprocessing import process_images
from preprocessing import copy_files_with_color_check
from preprocessing import copy_images
from preprocessing import get_image_formats_distribution

In [None]:


def setup_directory(directory):
    """Create ``directory`` (and any missing parents) if it does not exist yet.

    Calling this on a directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory
            (for example, a regular file with that name).
    """
    # exist_ok=True already tolerates an existing directory, so a separate
    # os.path.exists() check is unnecessary and would only add a window in
    # which the answer can change between the check and the create.
    os.makedirs(directory, exist_ok=True)

def process_subreddits(subreddits, project_directory, process_directory):
    """Download, process and filter the images of each subreddit in turn.

    For every name in ``subreddits`` this:

    1. calls ``download_subreddit_images`` for the subreddit;
    2. reads ``<project_directory>/downloads/<subreddit>/<subreddit>_metadata.csv``
       and passes the resulting DataFrame to ``process_images`` together with
       the output path ``<process_directory>/<subreddit>2.csv``;
    3. calls ``copy_files_with_color_check`` with that output CSV, the
       subreddit's download folder and ``<process_directory>/<subreddit>_fil``.

    Args:
        subreddits: Iterable of subreddit names.
        project_directory: Folder that contains the ``downloads`` directory.
        process_directory: Folder that receives the per-subreddit CSV and the
            ``<subreddit>_fil`` folder.
    """
    for subreddit in subreddits:
        # NOTE(review): the downloader is given no target path; presumably it
        # writes below ./downloads relative to the working directory — confirm.
        download_subreddit_images(subreddit)

        # All paths for this subreddit, derived from its download folder.
        source_folder = os.path.join(project_directory, 'downloads', subreddit)
        input_csv = os.path.join(source_folder, f'{subreddit}_metadata.csv')
        output_csv = os.path.join(process_directory, f'{subreddit}2.csv')
        destination_folder = os.path.join(process_directory, f'{subreddit}_fil')

        metadata = pd.read_csv(input_csv)
        process_images(metadata, output_csv)
        copy_files_with_color_check(output_csv, source_folder, destination_folder)

In [None]:

# Run everything from the project root.
project_directory = "/data/notebook_files/Reddit_Project"
os.chdir(project_directory)

# Folder that receives the processed CSVs and the filtered image folders.
process_directory = '/data/notebook_files/Further_Process'
setup_directory(process_directory)

subreddits = ['wholesomememes', 'dankmemes']
# Download the memes of each subreddit into its own subfolder of the
# downloads directory, then process and filter them.
process_subreddits(subreddits, project_directory, process_directory)

# Derived from project_directory (instead of repeating the literal) so it always
# points at the same `downloads` folder that process_subreddits reads from.
downloads_directory = os.path.join(project_directory, 'downloads')

In [None]:

IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def print_image_counts(parent_directory):
    """Print, for each direct subfolder of ``parent_directory``, how many files
    in it have an image extension (``.jpg``, ``.jpeg`` or ``.png``, any case).

    Only the files directly inside each subfolder are counted; nested folders
    are not descended into.
    """
    with os.scandir(parent_directory) as entries:
        subfolders = [entry.path for entry in entries if entry.is_dir()]
    for subfolder in subfolders:
        num_images = sum(
            name.lower().endswith(IMAGE_EXTENSIONS) for name in os.listdir(subfolder)
        )
        print(f"Folder: {subfolder}, Number of Images: {num_images}")


# Number of images in each subfolder of the downloads directory.
print_image_counts(downloads_directory)

# Gather the images from the processing folder into one dataset folder.
new_folder = os.path.join(process_directory, 'FinalDataset', 'Dataset')
setup_directory(new_folder)
copy_images(process_directory, new_folder)
print("Images copied successfully to the new folder.")

# Number of images in each subfolder of the processing directory.
print_image_counts(process_directory)

distribution = get_image_formats_distribution(new_folder)
print("Image Format Distribution:")
# `image_format` rather than `format`, so the builtin is not shadowed in the
# notebook's global namespace.
for image_format, count in distribution.items():
    print(f"{image_format}: {count} images")

# Load the per-subreddit CSVs from the same location process_subreddits wrote
# them to, and stack them into a single frame.
dfs = {name: pd.read_csv(os.path.join(process_directory, f'{name}2.csv')) for name in subreddits}

stacked_df = pd.concat(dfs.values(), ignore_index=True)

In [None]:
# Keep only the rows whose image file is present in the final dataset folder.
file_in_dataset = stacked_df['Filename'].apply(
    lambda filename: os.path.exists(os.path.join(new_folder, filename))
)
df_filtered2 = stacked_df[file_in_dataset]

# A bare `len(...)` in the middle of a cell is not displayed, so report the
# number of surviving rows explicitly.
print(f"Rows with an image in the final dataset: {len(df_filtered2)}")

# Drop the colour columns, then keep the first row for each filename.
df_filtered2 = df_filtered2.drop(columns=['Dominant Colors', 'Color Names'])
df_no_duplicates = df_filtered2.drop_duplicates(subset='Filename', keep='first')

# Written relative to the current working directory.
df_no_duplicates.to_csv('memes_metadata.csv', index=False)