## Step 1: Data Collection & Preparation

In [4]:
import re

import pandas as pd

# 1. SETUP
filename = 'trending_yt_videos_113_countries.csv'
chunk_size = 50000
music_pieces = []

# Keywords that suggest a video is music.
# A row is kept when its title OR its tags contain any of these words.
music_keywords = ['music', 'official video', 'lyrics', 'song', 'vevo', 'audio', 'records']

# Build the search pattern once (e.g. "music|lyrics|song") instead of once per
# chunk. Keywords are escaped so that a keyword containing regex
# metacharacters is still matched literally.
pattern = '|'.join(re.escape(keyword) for keyword in music_keywords)

print("Starting to process the giant file using Keyword Filtering...")

try:
    # 2. PROCESS IN CHUNKS
    # Reading in chunks keeps only `chunk_size` rows in memory at a time.
    for chunk in pd.read_csv(filename, chunksize=chunk_size, on_bad_lines='skip'):

        # Missing titles/tags become empty strings so the .str accessor works
        chunk['title'] = chunk['title'].fillna('').astype(str)
        chunk['video_tags'] = chunk['video_tags'].fillna('').astype(str)

        # FILTER: keep a row IF (title has a keyword) OR (tags have a keyword).
        # case=False means "Music" and "music" both count.
        is_music = (
            chunk['title'].str.contains(pattern, case=False)
            | chunk['video_tags'].str.contains(pattern, case=False)
        )

        music_pieces.append(chunk[is_music].copy())

    # 3. COMBINE & SAVE
    if music_pieces:
        music_df = pd.concat(music_pieces, ignore_index=True)
        print("Success!")
        print(f"Found {len(music_df)} potential music videos.")
        display(music_df.head(3))

        # Save the smaller file
        music_df.to_csv('my_music_data_filtered.csv', index=False)
        print("Saved to 'my_music_data_filtered.csv'.")
    else:
        print("No music videos found. Try adding more keywords.")

except pd.errors.ParserError as e:
    # Raised by the tokenizer for structurally broken input (for example an
    # unterminated quoted field). As the recorded run of this cell shows,
    # on_bad_lines='skip' does not prevent this error.
    print(f"An error occurred: {e}")
    print(f"Nothing was saved; {len(music_pieces)} filtered chunk(s) were discarded.")
except (OSError, KeyError) as e:
    # OSError: the input is missing/unreadable or the output cannot be written.
    # KeyError: the file has no 'title' or 'video_tags' column.
    # Anything else is a genuine bug and is allowed to surface with a traceback.
    print(f"An error occurred: {e}")

Starting to process the giant file using Keyword Filtering...
An error occurred: Error tokenizing data. C error: EOF inside string starting at row 2547993


In [17]:
import csv

# --- CONFIGURATION ---
input_filename = 'trending_yt_videos_113_countries.csv' # Your giant file
output_filename = 'music_data_final.csv'                # The new safe file

# Keywords to find music
music_keywords = ['music', 'official video', 'lyrics', 'song', 'vevo', 'audio', 'records']

print("Starting repair... extracting music and removing dangerous descriptions.")

total_rows = 0    # rows successfully parsed from the input
music_rows = 0    # rows written to the output
skipped_rows = 0  # rows dropped because they could not be parsed or written

# errors='replace' substitutes undecodable bytes instead of raising;
# newline='' on the output is what the csv module expects for writing.
with open(input_filename, 'r', encoding='utf-8', errors='replace') as f_in, \
     open(output_filename, 'w', encoding='utf-8', newline='') as f_out:

    reader = csv.DictReader(f_in)

    # REMOVE 'description' FROM HEADERS
    # Keep every existing column name EXCEPT 'description'.
    # `fieldnames` is None for an empty input file, hence the `or []`.
    safe_headers = [h for h in (reader.fieldnames or []) if h != 'description']

    writer = csv.DictWriter(f_out, fieldnames=safe_headers)
    writer.writeheader()

    # A `for` loop cannot resume after the iterator raises, so rows are pulled
    # manually with next() to be able to step over malformed records.
    while True:
        try:
            row = next(reader)
        except StopIteration:
            break
        except csv.Error:
            # Only parse errors are skipped. Other failures (e.g. an OSError
            # while reading) propagate instead of being retried forever.
            skipped_rows += 1
            continue

        total_rows += 1

        # Check if it is music. Short rows carry None for missing columns;
        # treat that as empty text rather than the literal string "None".
        content_check = f"{row.get('title') or ''} {row.get('video_tags') or ''}".lower()

        if any(keyword in content_check for keyword in music_keywords):

            # DELETE DESCRIPTION DATA
            row.pop('description', None)

            # Write the clean row. DictWriter raises ValueError when the row
            # has more fields than the header; such rows are dropped and
            # counted instead of vanishing silently.
            try:
                writer.writerow(row)
            except ValueError:
                skipped_rows += 1
            else:
                music_rows += 1

        if total_rows % 500000 == 0:
            print(f"Scanned {total_rows} rows... Found {music_rows} music videos.")

print(f"\nSUCCESS! Saved {music_rows} music videos to '{output_filename}'.")
if skipped_rows:
    print(f"Skipped {skipped_rows} malformed rows.")

Starting repair... extracting music and removing dangerous descriptions.
Scanned 500000 rows... Found 126144 music videos.
Scanned 1000000 rows... Found 202887 music videos.
Scanned 1500000 rows... Found 245854 music videos.
Scanned 2000000 rows... Found 289545 music videos.
Scanned 2500000 rows... Found 332711 music videos.

SUCCESS! Saved 336135 music videos to 'music_data_final.csv'.


In [18]:
import pandas as pd

# Read the repaired, music-only CSV back into a DataFrame
df = pd.read_csv(filepath_or_buffer='music_data_final.csv')

print("Data Loaded Successfully!")
# Preview the first five rows
display(df.head(n=5))

Data Loaded Successfully!


Unnamed: 0,title,channel_name,daily_rank,daily_movement,weekly_movement,snapshot_date,country,view_count,like_count,comment_count,thumbnail_url,video_id,channel_id,video_tags,kind,publish_date,langauge
0,Raymer - Kapoto (Official Music Video),Raymer,1,0,0,2025-11-29,ZW,460391,9516,895,https://i.ytimg.com/vi/pA8EAJit4MI/mqdefault.jpg,pA8EAJit4MI,UCoG7RTHJTxu89ZO-ep4h4ZQ,,youtube#video,2025-11-19 00:00:00+00:00,und
1,Mambo Dhuterere X @FreemanHKDBOSS - Nhovo Vis...,MAMBO DHUTERERE,4,0,46,2025-11-29,ZW,178648,8147,951,https://i.ytimg.com/vi/MG7o5ShlW1s/mqdefault.jpg,MG7o5ShlW1s,UCIrBf4UuflPNKXtFvcGVY2w,"Mambo, Dhuterere, mambo dhuterere, jah prayzah...",youtube#video,2025-11-24 00:00:00+00:00,und
2,Bling4 - PaHarare (Official Music Video),Bling4,7,-1,-3,2025-11-29,ZW,375283,14225,1585,https://i.ytimg.com/vi/XG2JlbZwmKs/mqdefault.jpg,XG2JlbZwmKs,UCjjWq2vfnelJYaIo6KnGWjg,,youtube#video,2025-11-13 00:00:00+00:00,en
3,"Killer T, Xiba - Bhiya (Official Video)",KillerTVEVO,11,-3,-4,2025-11-29,ZW,925765,18012,1195,https://i.ytimg.com/vi/VPswb0oVsKU/mqdefault.jpg,VPswb0oVsKU,UC705i_Y5Nv0PRKT7Xf4UcIA,"Xiba, JungleEnt, Afro Beat, World, World Beat,...",youtube#video,2025-10-31 00:00:00+00:00,und
4,"Moyo Wangu (feat. Atenda Chinx, Shona Prince &...",LearnZimbabweVEVO,13,-1,6,2025-11-29,ZW,134404,3060,221,https://i.ytimg.com/vi/YbrwIWz4OSc/mqdefault.jpg,YbrwIWz4OSc,UCx5GZ0CqFAfEo7QvsjnDGvw,"Learn Zimbabwe, Atenda Chinx, Shona Prince, Ta...",youtube#video,2025-11-17 00:00:00+00:00,ru


## Step 2: Sentiment Analysis

In [19]:
from functools import lru_cache

import pandas as pd
from textblob import TextBlob

filename = 'music_data_final.csv'


@lru_cache(maxsize=None)
def _polarity(text):
    """Return TextBlob's polarity for `text`, computed once per distinct string."""
    return TextBlob(text).sentiment.polarity


def get_sentiment(text):
    """Score a title's sentiment as a number between -1.0 and 1.0.

    The value is converted to a string first so that missing titles do not
    raise. Results are memoised, so a title that occurs in several rows is
    only analysed once.
    """
    return _polarity(str(text))


try:
    df = pd.read_csv(filename)
except FileNotFoundError:
    print(f"Error: Could not find '{filename}'. Did the previous repair step finish successfully?")
else:
    print(f"Loaded {len(df)} rows from {filename}")

    # NOTE(review): the sample output of this cell is 0.0 for every
    # non-English title shown - confirm the scorer is meaningful for the
    # languages in this dataset before drawing conclusions from it.
    print("Calculating sentiment scores... (This takes about 1-2 minutes)")
    df['sentiment_score'] = df['title'].apply(get_sentiment)

    print("Done! Here is a sample of your enriched data:")
    display(df[['title', 'sentiment_score']].head(10)) # Try a small portion

Loaded 336135 rows from music_data_final.csv
Calculating sentiment scores... (This takes about 1-2 minutes)
Done! Here is a sample of your enriched data:


Unnamed: 0,title,sentiment_score
0,Raymer - Kapoto (Official Music Video),0.0
1,Mambo Dhuterere X @FreemanHKDBOSS - Nhovo Vis...,0.0
2,Bling4 - PaHarare (Official Music Video),0.0
3,"Killer T, Xiba - Bhiya (Official Video)",0.0
4,"Moyo Wangu (feat. Atenda Chinx, Shona Prince &...",0.0
5,"Bagga, Jnr Spragga - 1 Clan/Mafia [Official Vi...",0.0
6,NASHIE ZIM - APA (official Music Video),0.0
7,Prince Chigwida - Mazambara (Official Music Vi...,0.0
8,Nisha Ts & Raymer - Hello Mwari (Official Mus...,0.0
9,Mr Attention-Handi Nyore (Ga Ga 2 Minutes) Off...,0.0


## Step 3: Aggregate by Country

In [20]:
# Average title sentiment per country, ordered from highest to lowest mean
country_music_sentiment = (
    df.groupby('country')['sentiment_score']
    .mean()
    .reset_index()
    .rename(columns={
        'country': 'Country_Code',
        'sentiment_score': 'Avg_Music_Sentiment',
    })
    .sort_values(by='Avg_Music_Sentiment', ascending=False)
)

print("--- National Music Sentiment Scores ---")
display(country_music_sentiment.head())

# Persist the per-country table for later steps
country_music_sentiment.to_csv('country_music_scores.csv', index=False)
print("Saved aggregated data to 'country_music_scores.csv'")

--- National Music Sentiment Scores ---


Unnamed: 0,Country_Code,Avg_Music_Sentiment
82,PH,0.162485
59,LK,0.149765
77,NZ,0.13387
33,GE,0.104214
83,PK,0.083081


Saved aggregated data to 'country_music_scores.csv'


## Step 4: The "Matching" Check

In [30]:
import pandas as pd

filename = 'world_happiness_2024.csv'

# Stays None when the file could not be loaded, so a failed run never leaves a
# stale frame from an earlier execution in place.
happiness_df = None
load_error = None

print(f"Attempting to load {filename}...")

# Try each encoding with the SAME parser settings. sep=None with the Python
# engine auto-detects the separator; keeping it on the fallback attempt means
# a non-comma-delimited file is not read as one giant column.
for encoding in ('utf-8', 'latin1'):
    try:
        happiness_df = pd.read_csv(filename, sep=None, engine='python', encoding=encoding)
    except UnicodeDecodeError as e:
        # Wrong encoding: remember why, then fall through to the next one.
        load_error = e
        print(f"Could not decode the file as {encoding}. Trying alternative encodings...")
        continue
    except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # Missing/unreadable/empty/malformed file: another encoding will not help.
        load_error = e
    break

if happiness_df is not None:
    print("Success! Data loaded.")
    print(f"Columns found: {list(happiness_df.columns)}")
    # NOTE(review): the recorded output shows 'Ladder score' values such as
    # 1721 and 53042, which suggests the decimal separators were lost somewhere
    # upstream - confirm against the raw file before using these columns.
    display(happiness_df.head(3))
else:
    print(f"CRITICAL ERROR: Could not read the file. Details: {load_error}")

Attempting to load world_happiness_2024.csv...
Success! Data loaded.
Columns found: ['Ranking', 'Country', 'Regional indicator', 'Ladder score', 'GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption']


Unnamed: 0,Ranking,Country,Regional indicator,Ladder score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,140,Afghanistan,South Asia,1721,293451,0,62,0,22638,15383
1,86,Albania,Central and Eastern Europe,53042,671748,57133,74,79892,34403,8517
2,84,Argelia,Sub-Saharan Africa,53635,618327,73652,72,28611,22771,34775
