In [2]:
# Cleaning UserAchievements.csv file
import pandas as pd
import os
import numpy as np

# Read the raw UserAchievements table from the Meta Kaggle input dataset
df = pd.read_csv('/kaggle/input/meta-kaggle/UserAchievements.csv')
print("Reading completed..\n")

# Keep a row when at least one of Points / TotalGold / TotalSilver is non-zero,
# i.e. discard only the rows in which all three columns equal zero
has_activity = df[['Points', 'TotalGold', 'TotalSilver']].ne(0).any(axis=1)
df_filtered = df[has_activity]
print("Filtering completed..\n")

# Write the surviving rows (without the index column) to the Kaggle working directory
df_filtered.to_csv('/kaggle/working/UserAchievements_Cleaned.csv', index=False)
print("Saving to --> /kaggle/working/UserAchievements_Cleaned.csv")

Reading completed..

Filtering completed..

Saving to --> /kaggle/working/UserAchievements_Cleaned.csv


In [None]:
import pandas as pd
import os
import numpy as np

# Build a per-column metadata summary for a fixed list of CSV tables and save
# it as a single Excel workbook.

# Directory that holds the raw dataset CSV files
dataset_directory = '/kaggle/input/meta-kaggle/'  # Update this path to your specific dataset directory

# Directory that receives generated files: the metadata workbook written at the
# end of this cell, and the cleaned UserAchievements CSV that is read back below
output_directory = '/kaggle/working/'  # This is a common output directory in Kaggle kernels

# List of specific filenames you want to process
filenames_to_process = [
    "UserAchievements_Cleaned.csv",
    "Submissions.csv",
    "Users.csv",
    "ForumMessages.csv",
    "Teams.csv",
    "UserFollowers.csv",
    "ForumMessageVotes.csv",
    "ForumTopics.csv",
    "KernelTags.csv",
    "Datasets.csv",
    "DatasetVersions.csv",
    "DatasetTags.csv",
    "Forums.csv",
    "Competitions.csv",
    "DatasetTaskSubmissions.csv",
    "DatasetTasks.csv",
    "UserOrganizations.csv",
    "Tags.csv",
    "Organizations.csv",
    "CompetitionTags.csv",
    "KernelLanguages.csv"
]


def _describe_column(table_name, column, col_data):
    """Return a dict of summary metadata for one column of one table.

    Parameters
    ----------
    table_name : str
        Name of the table the column belongs to.
    column : str
        Column label.
    col_data : pandas.Series
        The column's values.

    Returns
    -------
    dict
        Always contains 'table_name', 'column_name', 'data_type',
        'non_null_count' and 'unique_count'. Numeric columns additionally get
        'min', 'max', 'mean', 'median' and 'std_dev'; categorical or object
        columns additionally get 'top_categories' (up to five most frequent
        values).
    """
    meta = {
        'table_name': table_name,
        'column_name': column,
        'data_type': col_data.dtype,
        'non_null_count': col_data.notnull().sum(),
        'unique_count': col_data.nunique()
    }

    # Additional statistics for numerical columns
    if pd.api.types.is_numeric_dtype(col_data):
        meta['min'] = col_data.min()
        meta['max'] = col_data.max()
        meta['mean'] = col_data.mean()
        meta['median'] = col_data.median()
        meta['std_dev'] = col_data.std()

    # Additional information for categorical columns.
    # The isinstance check replaces the deprecated
    # pd.api.types.is_categorical_dtype, which warns on recent pandas versions.
    if isinstance(col_data.dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(col_data):
        meta['top_categories'] = col_data.value_counts().nlargest(5).index.tolist()

    return meta


# One dict per (table, column) pair, accumulated across all files
metadata = []

for filename in filenames_to_process:
    print(f"Processing {filename}...")
    # Special case for the cleaned UserAchievements.csv since it's in a different directory.
    if filename == "UserAchievements_Cleaned.csv":
        df = pd.read_csv(os.path.join(output_directory, filename))
    else:
        df = pd.read_csv(os.path.join(dataset_directory, filename))

    # splitext strips only the final extension, so a name containing extra
    # dots is not truncated at the first one
    table_name = os.path.splitext(filename)[0]

    # Gather metadata for every column of this table
    metadata.extend(
        _describe_column(table_name, column, df[column]) for column in df.columns
    )

    print(f"Completed processing {filename}.\n")

# Convert metadata list to a DataFrame for easier viewing and analysis
metadata_df = pd.DataFrame(metadata)

# Save the metadata to an Excel file in the output directory
metadata_file_path = os.path.join(output_directory, 'metadata.xlsx')

print("Saving metadata to Excel...")
metadata_df.to_excel(metadata_file_path, index=False)

print(f"Metadata file created successfully at : {metadata_file_path}\n")


In [6]:
# Cleaning Submissions.csv file
import pandas as pd

# Input (raw dataset) and output (working directory) locations
file_path = '/kaggle/input/meta-kaggle/Submissions.csv'
output_path = '/kaggle/working/Submissions_Cleaned.csv'

# Read only the columns listed here; everything else in the CSV is skipped
columns = ['Id', 'SubmittedUserId', 'TeamId', 'PublicScoreLeaderboardDisplay', 'SubmissionDate', 'IsAfterDeadline']
df = pd.read_csv(file_path, usecols=columns)

# Parse the date column so that the sort below orders by time, not by text
df['SubmissionDate'] = pd.to_datetime(df['SubmissionDate'])

# Order rows by user (ascending) and, within each user, by date (newest first)
sort_keys = ['SubmittedUserId', 'SubmissionDate']
sort_ascending = [True, False]
df_sorted = df.sort_values(by=sort_keys, ascending=sort_ascending)

# Keep only the first row seen for each user, which after the sort above is
# that user's most recent submission.
# NOTE(review): rows with a missing SubmittedUserId would presumably be treated
# as duplicates of one another here, leaving a single such row — confirm that
# this is intended.
is_repeat_user = df_sorted.duplicated(subset='SubmittedUserId', keep='first')
df_cleaned = df_sorted[~is_repeat_user]

# Write the de-duplicated rows (without the index column) to the output CSV
df_cleaned.to_csv(output_path, index=False)

print(f'Cleaned file saved to: {output_path}')

Cleaned file saved to: /kaggle/working/Submissions_Cleaned.csv
