# DOWNLOADING ALL REQUIRED MODULES FROM NATURAL LANGUAGE TOOLKIT FOR PREPROCESSING THE SONG LYRICS

In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
import os

# Ensure necessary NLTK data packages are downloaded.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# make word_tokenize load its sentence-tokenizer data from 'punkt_tab'; the
# older 'punkt' package is kept so earlier NLTK versions keep working.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gnand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gnand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gnand\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\gnand\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# PREPROCESSING LYRICS

In [32]:
# Shared helpers for preprocess_lyrics: the WordNet lemmatizer and the set of
# English stop words (a set so membership checks are cheap).
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

def preprocess_lyrics(lyrics):
    """Normalize one song's lyrics into a space-separated string of lemmas.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic are discarded, the rest are lower-cased, tokens found in the
    module-level ``stop_words`` set are dropped, and each remaining token is
    passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    lyrics : str
        Raw lyric text. Any non-string value (for example ``None`` or the
        float NaN pandas uses for a missing cell) is treated as "no lyrics".

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when the input is
        not a string or no token survives the filters.
    """
    # Guard against missing values so a stray NaN/None yields an empty
    # document instead of an exception from the tokenizer in .apply().
    if not isinstance(lyrics, str):
        return ''

    cleaned_tokens = []
    for token in word_tokenize(lyrics):
        # Drops punctuation, numbers and mixed tokens such as "n't".
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(lowered))
    return ' '.join(cleaned_tokens)



# GETTING THE DIRECTORY AND COMBINING ALL THE CSV FILES

In [43]:
# Define the directory containing the CSV files
directory = "SONGS"

# Collect the CSV file names in sorted order. os.listdir() returns entries in
# arbitrary order, and the row order of the combined dataframe should be the
# same on every run and every machine.
csv_filenames = sorted(
    filename for filename in os.listdir(directory) if filename.endswith(".csv")
)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list, so stop early with a message that names the directory.
if not csv_filenames:
    raise ValueError(f"No .csv files found in directory {directory!r}")

# Read each CSV file into its own dataframe
dataframes = [
    pd.read_csv(os.path.join(directory, filename)) for filename in csv_filenames
]

# Combine all dataframes into a single dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.head()

Unnamed: 0.1,Artist,Title,Album,Date,Lyric,Year,Unnamed: 0
0,Ariana Grande,"​thank u, next","thank u, next",2018-11-03,thought i'd end up with sean but he wasn't a m...,2018,
1,Ariana Grande,7 rings,"thank u, next",2019-01-18,yeah breakfast at tiffany's and bottles of bub...,2019,
2,Ariana Grande,​God is a woman,Sweetener,2018-07-13,you you love it how i move you you love it how...,2018,
3,Ariana Grande,Side To Side,Dangerous Woman,2016-05-20,ariana grande nicki minaj i've been here all ...,2016,
4,Ariana Grande,​​no tears left to cry,Sweetener,2018-04-20,right now i'm in a state of mind i wanna be in...,2018,


# IDENTIFYING AND HANDLING MISSING VALUES

In [34]:
# Check for missing values. The summary is printed explicitly: Jupyter only
# renders a bare expression when it is the last line of the cell, so the
# original un-printed call produced no output.
missing_counts = combined_df.isnull().sum()
print(missing_counts)

# Handle missing values: drop rows with missing lyrics. The result is
# reassigned rather than using inplace=True.
combined_df = combined_df.dropna(subset=['Lyric'])



# DROP UNNECESSARY COLUMNS

In [41]:
# Drop the leftover "Unnamed: N" column(s) (presumably an index that was saved
# into the CSV files). The name is matched case-insensitively by prefix so the
# cell works whether or not the column names have been standardized yet; the
# literal 'unnamed:_0' only exists after standardization, so dropping it by
# that exact name raises KeyError when the notebook is run top to bottom.
unnamed_columns = [
    column for column in combined_df.columns
    if str(column).lower().startswith('unnamed')
]
combined_df = combined_df.drop(columns=unnamed_columns)

# STANDARDIZATION OF COLUMNS AND PREPROCESSING OF LYRICS

In [42]:
# Standardize column names (lower case, spaces replaced by underscores)
combined_df.columns = combined_df.columns.str.lower().str.replace(' ', '_')

# Check and remove duplicates, then renumber the rows 0..n-1. Rows have been
# dropped by this point, so without the reset the index labels no longer match
# row positions, and later cells address songs by their position in the
# TF-IDF and similarity matrices.
combined_df = combined_df.drop_duplicates().reset_index(drop=True)

# Apply preprocessing to the lyrics column
combined_df['cleaned_lyrics'] = combined_df['lyric'].apply(preprocess_lyrics)

combined_df.head()

Unnamed: 0,artist,title,album,date,lyric,year,cleaned_lyrics
0,Ariana Grande,"​thank u, next","thank u, next",2018-11-03,thought i'd end up with sean but he wasn't a m...,2018,thought end sean match wrote song ricky listen...
1,Ariana Grande,7 rings,"thank u, next",2019-01-18,yeah breakfast at tiffany's and bottles of bub...,2019,yeah breakfast tiffany bottle bubble girl tatt...
2,Ariana Grande,​God is a woman,Sweetener,2018-07-13,you you love it how i move you you love it how...,2018,love move love touch one said done believe god...
3,Ariana Grande,Side To Side,Dangerous Woman,2016-05-20,ariana grande nicki minaj i've been here all ...,2016,ariana grande nicki minaj night ariana day nic...
4,Ariana Grande,​​no tears left to cry,Sweetener,2018-04-20,right now i'm in a state of mind i wanna be in...,2018,right state mind wan na like time ai got tear ...


# FEATURE EXTRACTION WITH TF-IDF (Term Frequency-Inverse Document Frequency)

In [36]:
# Continue with the feature extraction and model building steps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Feature Extraction with TF-IDF
# The vectorizer is capped at 5000 features; fitting and transforming happen
# in a single call on the cleaned lyrics, giving one matrix row per song.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(
    raw_documents=combined_df['cleaned_lyrics'],
)



# APPLY COSINE SIMILARITY

In [37]:
# Compute Similarity Matrix
# Called with a single argument, cosine_similarity compares the rows of the
# matrix against themselves, i.e. every song against every song.
cosine_sim = cosine_similarity(tfidf_matrix)



# FUNCTION FOR RECOMMENDATION

In [38]:
# Create a Function for Recommendations
def recommend_songs(song_title, top_n=5):
    """Return the songs whose lyrics are most similar to ``song_title``.

    The title is looked up by exact match in the 'title' column of the global
    ``combined_df``; if several rows share the title, the first one is used.
    All other rows are ranked by their score in that song's row of the global
    ``cosine_sim`` matrix.

    Parameters
    ----------
    song_title : str
        Title exactly as stored in ``combined_df['title']``.
    top_n : int, default 5
        Maximum number of recommendations; values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame
        The 'artist' and 'title' columns of the recommended rows, most
        similar first.

    Raises
    ------
    IndexError
        If no row of ``combined_df`` has the given title.
    """
    # Work with row *positions*, not index labels: cosine_sim rows and .iloc
    # are positional, while the dataframe's labels may have gaps after rows
    # were dropped earlier in the notebook.
    title_matches = (combined_df['title'] == song_title).to_numpy()
    match_positions = title_matches.nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"Song title {song_title!r} not found in combined_df['title']")
    song_pos = int(match_positions[0])

    # Similarity of the query song to every song
    scores = cosine_sim[song_pos]

    # Rank all other songs by descending similarity. The query is excluded
    # explicitly instead of assuming it sorts first, which fails when another
    # song ties with it (e.g. identical lyrics).
    ranked_positions = sorted(
        (pos for pos in range(len(scores)) if pos != song_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )
    top_positions = ranked_positions[:max(top_n, 0)]

    # Return the artists and titles of the most similar songs
    return combined_df.iloc[top_positions][['artist', 'title']]



# PRINTING THE RECOMMENDED SONGS

In [39]:

# Example usage: fetch the default number of recommendations for one title
# and print them under a heading (heading, blank line, then the table).
rsongs = recommend_songs('7 rings')
print("RECOMENDED TOP 5 SONGS ARE:\n", rsongs, sep="\n")


RECOMENDED TOP 5 SONGS ARE:

             artist                                              title
133   Ariana Grande                                    7 rings (live)​
76    Ariana Grande                                    7 rings (Remix)
136   Ariana Grande  ​imagine / My Favorite Things / 7 rings / than...
4229      Lady Gaga  Do What U Want (Red Ant & Amp Lexvas Deep Hous...
3671     Katy Perry                                That’s More Like It
