In [25]:
import os
import pandas as pd

# Get all CSV files in the current directory
csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]

# Loop through each CSV and print column names and first 2 rows
for csv in csv_files:
    print(f"### {csv}")
    # Read the first two rows of the CSV file
    df = pd.read_csv(csv, nrows=2)
    
    # Print column names
    columns = df.columns.tolist()
    print(f"| {'Column Name':<15} | {'Row 1':<10} | {'Row 2':<10} |")
    print(f"|{'-'*15}|{'-'*10}|{'-'*10}|")
    
    # Print the values from the first two rows for each column
    for column in columns:
        row1 = df[column].iloc[0]
        row2 = df[column].iloc[1]
        print(f"| {column:<15} | {row1:<10} | {row2:<10} |")
    
    print("\n" + "-"*40 + "\n")


### links.csv
| Column Name     | Row 1      | Row 2      |
|---------------|----------|----------|
| movieId         | 1          | 2          |
| imdbId          | 114709     | 113497     |
| tmdbId          | 862        | 8844       |

----------------------------------------

### movies.csv
| Column Name     | Row 1      | Row 2      |
|---------------|----------|----------|
| movieId         | 1          | 2          |
| title           | Toy Story (1995) | Jumanji (1995) |
| genres          | Adventure|Animation|Children|Comedy|Fantasy | Adventure|Children|Fantasy |

----------------------------------------

### ratings.csv
| Column Name     | Row 1      | Row 2      |
|---------------|----------|----------|
| userId          | 1          | 1          |
| movieId         | 2          | 29         |
| rating          | 3.5        | 3.5        |
| timestamp       | 1112486027 | 1112484676 |

----------------------------------------

### tags.csv
| Column Name     | Row 1      | Row

In [3]:
import pandas as pd
import random
import re

# Function to extract the year from the movie title
def extract_year(title):
    # Use regular expression to extract the year within parentheses, e.g., 'Toy Story (1995)' -> 1995
    match = re.search(r'\((\d{4})\)', title)
    return int(match.group(1)) if match else None

# Function to process datasets considering all data from 2010 onwards
def process_datasets(random_seed=42):
    random.seed(random_seed)
    
    # Load all CSVs into pandas DataFrames
    links_df = pd.read_csv('links.csv')
    movies_df = pd.read_csv('movies.csv')
    ratings_df = pd.read_csv('ratings.csv')
    tags_df = pd.read_csv('tags.csv')
    
    # Extract the release year from the 'title' column
    movies_df['year'] = movies_df['title'].apply(extract_year)
    
    # Filter movies released in 2010 or later
    movies_df = movies_df[movies_df['year'] >= 2005]
    
    # Keep only the movieIds from the filtered movies
    filtered_movie_ids = movies_df['movieId'].tolist()
    
    # Filter the DataFrames based on the movieIds from 2010 onwards
    filtered_links_df = links_df[links_df['movieId'].isin(filtered_movie_ids)]
    filtered_movies_df = movies_df
    filtered_ratings_df = ratings_df[ratings_df['movieId'].isin(filtered_movie_ids)]
    filtered_tags_df = tags_df[tags_df['movieId'].isin(filtered_movie_ids)]
    
    # Save the filtered datasets to new CSV files
    filtered_links_df.to_csv('filtered_links.csv', index=False)
    filtered_movies_df.to_csv('filtered_movies.csv', index=False)
    filtered_ratings_df.to_csv('filtered_ratings.csv', index=False)
    filtered_tags_df.to_csv('filtered_tags.csv', index=False)
    
    print("Filtered datasets saved as:")
    print("filtered_links.csv")
    print("filtered_movies.csv")
    print("filtered_ratings.csv")
    print("filtered_tags.csv")

# Call the function to process the datasets
process_datasets()  # No sample size, uses all data

Filtered datasets saved as:
filtered_links.csv
filtered_movies.csv
filtered_ratings.csv
filtered_tags.csv


In [4]:

import os

# List of original filenames and the filtered filenames
original_files = ['links.csv', 'movies.csv', 'ratings.csv', 'tags.csv']
filtered_files = ['filtered_links.csv', 'filtered_movies.csv', 'filtered_ratings.csv', 'filtered_tags.csv']

# Delete the original files
for original_file in original_files:
    if os.path.exists(original_file):
        os.remove(original_file)
        print(f"Deleted original file: {original_file}")
    else:
        print(f"Original file not found: {original_file}")

# Rename the filtered files to the original names
for original_file, filtered_file in zip(original_files, filtered_files):
    if os.path.exists(filtered_file):
        os.rename(filtered_file, original_file)
        print(f"Renamed {filtered_file} to {original_file}")
    else:
        print(f"Filtered file not found: {filtered_file}")



Deleted original file: links.csv
Deleted original file: movies.csv
Deleted original file: ratings.csv
Deleted original file: tags.csv
Renamed filtered_links.csv to links.csv
Renamed filtered_movies.csv to movies.csv
Renamed filtered_ratings.csv to ratings.csv
Renamed filtered_tags.csv to tags.csv
