# Parse .txt files and insert them into the database
The lyrics files are cleaned, and then the songs and their lyrics are added to the database.

In [None]:
import pandas as pd
import os
import pymysql
import re
from os import walk
import unidecode
from pymysql import DataError

In [None]:
# Local data directories used by the notebook.
FOLDER_TXT = '/home/tanguy/data/lyrizz/txt'      # lyrics files, read as <track_id>.txt
FOLDER_CSV = '/home/tanguy/data/lyrizz/csv'      # contains df_tracks.csv
FOLDER_IMG = '/home/tanguy/data/lyrizz/images'   # images whose file names start with the track id

In [None]:
# Track metadata, stored as a ';'-separated CSV.
df_tracks = pd.read_csv(os.path.join(FOLDER_CSV, 'df_tracks.csv'), sep=';')

# Files sitting directly in FOLDER_TXT; the default triple yields an empty
# list when walk() produces nothing (e.g. the folder does not exist).
top_level = next(walk(FOLDER_TXT), (None, None, []))
filenames = top_level[2]

# Dropping the last four characters (the ".txt" suffix) gives the track id.
list_track = [fname[:-4] for fname in filenames]

# Keep only the tracks for which a lyrics file is present.
has_lyrics_file = df_tracks['track_id'].isin(list_track)
tracks = df_tracks[has_lyrics_file]

### Function definitions

In [None]:
# Lyrics files sometimes carry section markers such as "[Refrain]".
def get_section_from_line(line):
    """Return the normalised section name announced by a marker line.

    The test is a case-insensitive substring search. Keyword groups are
    checked in a fixed priority order and the first matching group wins.
    Lines mentioning 'parole' or 'lyric', and lines containing no known
    keyword, map to the empty string.
    """
    lowered = line.lower()

    # (keywords, section) pairs, in priority order.
    keyword_groups = (
        (('parole', 'lyric'), ''),
        (('intro',), 'intro'),
        (('refrain', 'chorus'), 'chorus'),
        (('couplet', 'verse'), 'verse'),
        (('outro',), 'outro'),
        (('bridge', 'pont'), 'bridge'),
        (('break', 'pause'), 'break'),
        (('hook', 'crochet'), 'hook'),
    )

    for keywords, section in keyword_groups:
        if any(word in lowered for word in keywords):
            return section
    return ''

In [None]:
def filter_title(name):
    """Return a simplified song title.

    Everything from the first ' - ' onwards (e.g. "- Remastered ...") and
    everything from the first '(' onwards (e.g. "(Remastered ...)") is
    removed, then surrounding whitespace is stripped.

    If cutting at '(' would leave nothing - the title itself starts with a
    parenthesis, as in "(I Can't Get No) Satisfaction" - the parenthetical
    is kept so that the title is not reduced to an empty string.
    """
    # Try to remove "- Remastered ..."
    head = name.split(' - ')[0]

    # Try to remove " (Remastered ...)"
    cleaned = head.split('(')[0].strip()
    if cleaned:
        return cleaned

    # Nothing precedes the first '(': keep the parenthesised text.
    return head.strip()

def filter_artist(name):
    """Return the first artist of a comma-separated artists string.

    Everything from the first ',' onwards (other artists) and everything
    from the first '(' onwards (e.g. "(Feat ...)") is removed, then
    surrounding whitespace is stripped.

    If cutting at '(' would leave nothing - the artist name itself starts
    with a parenthesis, as in "(G)I-DLE" - the parenthesised text is kept
    so that the artist is not reduced to an empty string.
    """
    # Try to remove the other artists
    first = name.split(',')[0]

    # Try to remove " (Feat ...)"
    cleaned = first.split('(')[0].strip()
    if cleaned:
        return cleaned

    # Nothing precedes the first '(': keep the parenthesised text.
    return first.strip()

### Insert song and lyrics into the database

In [None]:
### Insert in database
# For every track that has a lyrics file: insert one row into `lyrizz_song`,
# then one row into `lyrizz_lyrics` per non-empty lyrics line. Tracks whose
# spotify_id is already present in `lyrizz_song` are skipped.
LIST_IMG = next(walk(FOLDER_IMG), (None, None, []))[2]
connection = pymysql.connect(host='localhost',user='django2',password='password',db='quizz_db',
                             charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)

# All statements are parameterized: the driver does the quoting.
sql_find_song = "SELECT id from lyrizz_song WHERE spotify_id = %s"
sql_insert_song = "INSERT INTO `lyrizz_song` (`spotify_id`, `name`, `artists`, `popularity`, `year`, `image`, `has_quote`, `has_image`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
sql_insert_lyrics = "INSERT INTO `lyrizz_lyrics` (`lyrics_text`, `section`, `song_id`) VALUES (%s, %s, %s)"

try:
    for _, raw in tracks.iterrows():
        track_id = raw['track_id']
        filename = os.path.join(FOLDER_TXT, f'{track_id}.txt')
        with open(filename) as f:
            lines = f.readlines()

        # Files with fewer than 10 lines are treated as having no lyrics.
        if len(lines) < 10:
            print(track_id, 'No lyrics')
            continue

        # The song row is stored with has_image=1, so require a cover file.
        # A missing cover skips this track instead of aborting the whole run
        # with an IndexError.
        if not any(x.startswith(track_id) for x in LIST_IMG):
            print(track_id, 'No image')
            continue

        ##########################################
        # Add song in database
        ##########################################
        artists = filter_artist(raw['artists'])
        name = filter_title(raw['name'])

        popularity = raw['popularity']
        # release_date is split on '-' and its first field is the year.
        year = int(raw['release_date'].split('-')[0])

        print(track_id, end=' ')
        with connection.cursor() as cursor:
            cursor.execute(sql_find_song, (track_id,))
            res = cursor.fetchall()
        connection.commit()

        if res:
            print('ALREADY')
            continue

        # The song and all of its lyrics are written in one transaction and
        # committed together. If anything fails half-way, nothing is
        # committed, so a later run does not find a partially imported song
        # and skip it as 'ALREADY'.
        with connection.cursor() as cursor:
            # Create a new record
            try:
                cursor.execute(sql_insert_song, (track_id, name[:200], artists[:200], int(popularity), year, f'covers_lyrizz/{track_id}.jpg', 1, 1))
            except DataError:
                # Retry with name/artists transliterated to ASCII.
                cursor.execute(sql_insert_song, (track_id, unidecode.unidecode(name[:200]), unidecode.unidecode(artists[:200]), int(popularity), year, f'covers_lyrizz/{track_id}.jpg', 1, 1))
            song_id = cursor.lastrowid

            ##########################################
            # Add lyrics in database
            ##########################################
            section = ''
            count = 0  # lines inserted on the first attempt (retried lines are not counted)
            for line in lines:
                # A line starting with '[' is a section marker, not lyrics.
                if line[0] == '[':
                    section = get_section_from_line(line)
                    continue
                if line == '\n':
                    continue
                text = unidecode.unidecode(line.replace('\n', ''))

                try:
                    cursor.execute(sql_insert_lyrics, (text, section, song_id))
                    count += 1
                except pymysql.MySQLError:
                    # Only database errors trigger the retry; KeyboardInterrupt
                    # and programming errors now propagate.
                    # NOTE(review): text is already transliterated above, so
                    # this retry sends the same values again.
                    cursor.execute(sql_insert_lyrics, (unidecode.unidecode(text), section, song_id))
                    print('U', end='')

        connection.commit()

        print(' ', count)
finally:
    # Always release the connection; closing without a commit leaves any
    # half-written song uncommitted.
    connection.close()

### Insert image name in database

In [None]:
### update image name in db
# For every song in `lyrizz_song`, store the actual cover file name found in
# FOLDER_IMG (first file whose name starts with the song's spotify_id).
LIST_IMG = next(walk(FOLDER_IMG), (None, None, []))[2]
connection = pymysql.connect(host='localhost',user='django2',password='password',db='quizz_db',
                             charset='utf8mb4',cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        cursor.execute("SELECT spotify_id from lyrizz_song")
        res = cursor.fetchall()
    connection.commit()

    for track in res:
        track_id = track['spotify_id']
        image = next((x for x in LIST_IMG if x.startswith(track_id)), None)
        if image is None:
            # No cover on disk: leave the row untouched instead of aborting
            # the whole run with an IndexError.
            print(track_id, 'No image')
            continue

        with connection.cursor() as cursor:
            # Parameterized so that quotes in a file name or id cannot break
            # (or alter) the statement.
            cursor.execute("UPDATE lyrizz_song SET image=%s WHERE spotify_id=%s",
                           (f'covers_lyrizz/{image}', track_id))
        connection.commit()

        print(image)
finally:
    # Release the connection even if a query fails.
    connection.close()