In [13]:
import pandas as pd
import os
import shutil

In [None]:
def _matchesAnyTerm_old(names, terms):
    """Return a boolean mask of the entries in *names* that contain any of *terms*."""
    # astype(bool) keeps the mask boolean even when *names* is empty, so it is
    # always applied as a row filter and never mistaken for a column selection.
    return names.apply(lambda name: any(term in name for term in terms)).astype(bool)


def _pickSplitRow_old(year_rows, label, include_terms, exclude_terms, pair_terms):
    """Choose the single row representing one split, or None if there is none.

    year_rows     - rows of one year (tournament name in the second column)
    label         - value written into the second column of the chosen row
    include_terms - a row is a candidate if its name contains any of these
    exclude_terms - dropped from the candidates, but only when there is more
                    than one candidate (same rule as the original code)
    pair_terms    - terms used to pick the final placement when exactly two
                    candidates remain
    """
    candidates = year_rows[_matchesAnyTerm_old(year_rows.iloc[:, 1], include_terms)]
    if candidates.shape[0] > 1:
        candidates = candidates[~_matchesAnyTerm_old(candidates.iloc[:, 1], exclude_terms)]
    if candidates.empty:
        return None

    count = candidates.shape[0]
    if count == 1:
        # A single result: most likely the team missed playoffs, so this
        # placement is treated as the final one.
        narrowed = candidates
    elif count == 2:
        # Two results: the playoff-like row is the final placement.
        narrowed = candidates[_matchesAnyTerm_old(candidates.iloc[:, 1], pair_terms)]
    else:
        # Three or more results: the finals row is the final placement.
        narrowed = candidates[_matchesAnyTerm_old(candidates.iloc[:, 1], ['finals'])]

    # The original indexed row 0 unconditionally and raised IndexError when the
    # narrowing matched nothing; fall back to the first candidate instead.
    if narrowed.empty:
        narrowed = candidates

    # Keep exactly one row, and copy it so the caller's frame is never modified.
    chosen = narrowed.iloc[[0]].copy()
    chosen.iloc[0, 1] = label
    return chosen


def onlyIncludeOneResultPerSplit_old(df):
    """Reduce tournament results to at most one row per year and split (legacy rules).

    Input:
    df - results with the tournament name in the second column; names are
         matched case-sensitively against lower-case terms.

    Returns a new DataFrame with the same columns and a fresh 0..n-1 index. For
    every year from 2012 to 2023 it holds at most one 'spring' row followed by
    at most one 'summer' row, and the second column is rewritten to
    '<year> spring' or '<year> summer'.

    NOTE(review): a lone candidate is never checked against the exclude terms,
    so a year whose only 'finals' row is a spring one is still reported as
    summer. This mirrors the original rules - confirm whether that is wanted.
    """
    if df.empty:
        return pd.DataFrame(columns=df.columns)

    # (split name, include terms, exclude terms, terms used for two candidates)
    split_rules = (
        ('spring', ['spring'], ['summer'], ['playoffs']),
        ('summer', ['summer', 'championship', 'finals'], ['spring'],
         ['playoffs', 'finals', 'championship']),
    )

    names = df.iloc[:, 1]
    picked = []
    for year in range(2012, 2024):
        year_rows = df[names.str.contains(str(year))]
        for split, include_terms, exclude_terms, pair_terms in split_rules:
            row = _pickSplitRow_old(year_rows, str(year) + ' ' + split,
                                    include_terms, exclude_terms, pair_terms)
            if row is not None:
                picked.append(row)

    if not picked:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(picked).reset_index(drop=True)

In [None]:
def onlyIncludeOneResultPerSplit(df):
    """Reduce tournament results to at most one row per year and split.

    Input:
    df - results with the tournament name in the second column; names are
         matched case-sensitively against lower-case terms.

    For every year from 2012 to 2023 the first row whose name mentions the
    year and 'spring' is kept as '<year> spring'. The first row mentioning the
    year and 'summer', 'championship' or 'finals' is kept as '<year> summer'.

    Rows that mention 'spring' are never used for the summer split. Previously
    a row such as '... spring finals' matched the 'finals' term and was emitted
    a second time, mislabelled as a summer result.

    Returns a new DataFrame with the same columns and a fresh 0..n-1 index.
    The input frame is not modified.
    """
    if df.empty:
        return pd.DataFrame(columns=df.columns)

    # (split name, terms that select a row, terms that disqualify a row)
    split_rules = (
        ('spring', ('spring',), ()),
        ('summer', ('summer', 'championship', 'finals'), ('spring',)),
    )

    names = df.iloc[:, 1]
    picked = []
    for year in range(2012, 2024):
        year_rows = df[names.str.contains(str(year))]
        if year_rows.empty:
            continue
        year_names = year_rows.iloc[:, 1]

        for split, include_terms, exclude_terms in split_rules:
            matches = year_rows[year_names.apply(
                lambda name: any(term in name for term in include_terms)
                and not any(term in name for term in exclude_terms)
            )]
            if matches.empty:
                continue
            # Copy the single kept row before relabelling so that neither the
            # caller's frame nor an intermediate view is written to.
            chosen = matches.iloc[[0]].copy()
            chosen.iloc[0, 1] = str(year) + ' ' + split
            picked.append(chosen)

    if not picked:
        return pd.DataFrame(columns=df.columns)
    return pd.concat(picked).reset_index(drop=True)

def removeIrrelevantTournaments(df):
    """Keep only rows whose tournament name looks like a main-league event.

    Input:
    df - results with the tournament name in the second column.

    The name is split on whitespace. A row is kept when at least one token is
    a league/region keyword and no token is an excluded keyword. Matching is
    on whole tokens, so 'na' does not match 'nationals'.

    Returns the filtered rows with their original index and columns. The
    columns are preserved even when every row is filtered out.
    """
    if df.empty:
        return df

    # 'eu lcs' used to be listed here too, but a term containing a space can
    # never equal a single whitespace-split token; such rows match via 'lcs'.
    wanted = {'lcs', 'na', 'lec', 'champions', 'korea', 'china', 'lck', 'lpl'}
    unwanted = {'cl', 'proving', 'expansion', 'academy', 'lock', 'showdown',
                'promotion', 'qualifiers', 'qualifier', 'preseason', 'rift',
                'worlds', 'iem', 'showmatch'}

    def is_relevant(name):
        tokens = set(name.split())
        return not tokens.isdisjoint(wanted) and tokens.isdisjoint(unwanted)

    # One boolean mask replaces the two chained filters. The old second filter
    # received an empty, non-boolean mask whenever the first filter removed
    # every row, which made pandas drop all columns from the result.
    keep = df.iloc[:, 1].apply(is_relevant).astype(bool)
    return df[keep]


In [None]:
def checksConsecutive(df, min_consecutive=2):
    """Return True if *df* contains a run of consecutive splits.

    Inputs:
    df              - rows whose second column reads '<year> spring' or
                      '<year> summer'
    min_consecutive - length of the run that must be present. The default of 2
                      is the threshold this function has always applied, even
                      though its earlier comments spoke of 4 splits.

    Each label is encoded as year * 2 + (0 for spring, 1 for summer), so two
    splits are consecutive exactly when their codes differ by 1. This includes
    summer of one year followed by spring of the next. A duplicated split
    starts a new run.

    A label that does not follow the '<year> <season>' pattern raises the
    underlying ValueError, KeyError or IndexError.
    """
    season_map = {'spring': 0, 'summer': 1}

    codes = []
    for label in df.iloc[:, 1]:
        parts = label.split()
        codes.append(int(parts[0]) * 2 + season_map[parts[1]])
    codes.sort()

    run_length = 0
    previous = None
    for code in codes:
        if previous is not None and code - previous == 1:
            run_length += 1
        else:
            run_length = 1
        if run_length >= min_consecutive:
            return True
        previous = code
    return False


def meetsCriteria(dir, player):
    """Load one player's tournament results and return them if they qualify.

    Inputs:
    dir    - folder containing '<player>_tournament_results/'
    player - player name used in the folder and file name

    The CSV is read without a header, lower-cased, passed through
    removeIrrelevantTournaments and then onlyIncludeOneResultPerSplit.

    Returns the reduced results when nothing is left after filtering, or when
    checksConsecutive accepts them. Otherwise returns an empty DataFrame. An
    empty CSV file counts as having no results. A missing file still raises
    FileNotFoundError.
    """
    file_path = os.path.join(dir, player + '_tournament_results',
                             player + '_tournament_results.csv')
    try:
        df = pd.read_csv(file_path, header=None)
    except pd.errors.EmptyDataError:
        df = pd.DataFrame()

    if df.empty:
        return df

    # Lower-case column by column; DataFrame.applymap is deprecated since
    # pandas 2.1.
    df = df.astype(str).apply(lambda column: column.str.lower())

    df = removeIrrelevantTournaments(df)
    if not df.empty:
        df = onlyIncludeOneResultPerSplit(df)

    if df.empty or checksConsecutive(df):
        return df
    return pd.DataFrame()
        

In [None]:
def extractPhotosdf(folder):
    """List one photo per year and split found in *folder*.

    Input:
    folder - directory whose file names mention a year and a split

    For every year from 2012 to 2023, file names containing the year are
    searched for a spring marker ('spring', 'split_1', 'split 1') and a summer
    marker ('summer', 'split 2', 'split_2'). Matching is case-sensitive.

    Returns a DataFrame with columns 'Year Season' ('<year> spring' or
    '<year> summer') and 'File Name', ordered by year with spring before
    summer. When several files match the same split, the alphabetically first
    one is used.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of the "first" matching photo reproducible between runs and machines.
    file_names = sorted(os.listdir(folder))

    season_terms = (
        ('spring', ('spring', 'split_1', 'split 1')),
        ('summer', ('summer', 'split 2', 'split_2')),
    )

    rows = []
    for year in range(2012, 2024):
        year_files = [name for name in file_names if str(year) in name]
        for season, terms in season_terms:
            match = next(
                (name for name in year_files if any(term in name for term in terms)),
                None,
            )
            if match is not None:
                rows.append({'Year Season': str(year) + ' ' + season,
                             'File Name': match})

    return pd.DataFrame(rows, columns=['Year Season', 'File Name'])




In [18]:
import pandas as pd
import os

def createDataSet(dir, output_csv_file):
    '''
    Build one CSV with the per-split results of every player.

    Inputs:
    dir - in format 'dataset'; expected layout is
          <dir>/<region>/<player>/ next to
          <dir>/<region>/<player>_tournament_results/<player>_tournament_results.csv
    output_csv_file- in format 'test/data.csv'; its folder is created if missing

    Every results file is read without a header, lower-cased, filtered with
    removeIrrelevantTournaments and reduced with onlyIncludeOneResultPerSplit.
    The three remaining columns are written as Placement, Season, Team
    together with the player name.

    Entries of dir that are not folders are skipped. An empty results file
    contributes no rows. A missing results file raises FileNotFoundError.
    '''
    columns = ['Player', 'Season', 'Placement', 'Team']
    frames = []

    for region in os.listdir(dir):
        subDir = os.path.join(dir, region)
        # Stray files in the dataset root would make os.listdir fail below.
        if not os.path.isdir(subDir):
            continue

        # Folders ending in 'results' hold the CSVs; the others are named
        # after the players.
        players = [folder for folder in os.listdir(subDir) if folder[-7:] != 'results']
        for player in players:
            file_path = os.path.join(subDir, player + '_tournament_results',
                                     player + '_tournament_results.csv')
            try:
                df = pd.read_csv(file_path, header=None)
            except pd.errors.EmptyDataError:
                continue
            if df.empty:
                continue

            # Lower-case column by column; DataFrame.applymap is deprecated
            # since pandas 2.1.
            df = df.astype(str).apply(lambda column: column.str.lower())
            df = removeIrrelevantTournaments(df)
            if df.empty:
                continue
            df = onlyIncludeOneResultPerSplit(df)
            if df.empty:
                continue

            df.columns = ['Placement', 'Season', 'Team']
            df['Player'] = player
            frames.append(df)

    # Concatenate once at the end instead of growing the result inside the loop.
    if frames:
        result_df = pd.concat(frames, ignore_index=True)[columns]
    else:
        result_df = pd.DataFrame(columns=columns)

    output_dir = os.path.dirname(output_csv_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the result DataFrame to CSV file.
    result_df.to_csv(output_csv_file, index=False)

# Runs when this cell executes: reads the 'Dataset' folder and writes the
# combined results to 'processedDataset/players.csv'.
createDataSet('Dataset', 'processedDataset/players.csv')


In [14]:
def copyPhotos(df, photosLocation, outputFolder, playername):
    """Copy the photos listed in *df* into *outputFolder* under new names.

    Inputs:
    df             - one row per photo; the first column holds the label used
                     in the new name and 'File Name' holds the source file name
    photosLocation - folder containing the source photos
    outputFolder   - destination folder, created if it does not exist
    playername     - prefix of every new file name

    Each photo is copied to '<playername>_<first column value>.jpg'. The new
    name is stored in a 'New File Name' column that is added to *df* in place,
    and the same frame is returned.

    NOTE(review): the '.jpg' suffix is used whatever the source extension is -
    confirm that this is intended for non-JPEG sources.
    """
    os.makedirs(outputFolder, exist_ok=True)

    # Adding a new column to store the new photo names
    df['New File Name'] = ""

    for index, row in df.iterrows():
        # row.iloc[0] reads the first column by position. The old row[0]
        # relied on the positional fallback of Series.__getitem__, which is
        # deprecated for label-indexed rows.
        new_photo_name = f"{playername}_{row.iloc[0]}.jpg"
        shutil.copyfile(
            os.path.join(photosLocation, row['File Name']),
            os.path.join(outputFolder, new_photo_name),
        )
        df.loc[index, 'New File Name'] = new_photo_name

    return df


def createPhotos(dir, output_folder):
    """Copy one photo per player and split, and log the copies to a CSV.

    Inputs:
    dir           - dataset root laid out as <dir>/<region>/<player>/
    output_folder - folder that receives the renamed photos

    For every player folder (folders ending in 'results' are ignored) the
    photos chosen by extractPhotosdf are copied with copyPhotos. The resulting
    table is appended to 'processedDataset/photos.csv'.

    The CSV header is written only when the file is new or empty. Previously
    every player appended another header line in the middle of the data.
    Entries that are not folders are skipped.
    """
    csv_path = 'processedDataset/photos.csv'

    for region in os.listdir(dir):
        subDir = os.path.join(dir, region)
        if not os.path.isdir(subDir):
            continue

        for player in os.listdir(subDir):
            if player[-7:] == 'results':
                continue
            photo_folder = os.path.join(subDir, player)
            if not os.path.isdir(photo_folder):
                continue

            photos_df = copyPhotos(extractPhotosdf(photo_folder), photo_folder,
                                   output_folder, player)

            os.makedirs(os.path.dirname(csv_path), exist_ok=True)
            write_header = (not os.path.exists(csv_path)
                            or os.path.getsize(csv_path) == 0)
            photos_df.to_csv(csv_path, mode='a', index=False, header=write_header)

# Runs when this cell executes: copies the selected player photos from
# 'Dataset' into 'processedDataset/photos'.
createPhotos('Dataset', 'processedDataset/photos')