In [156]:
import pandas as pd

from collections import Counter
from datetime import datetime

In [157]:
# 크롤링 원본 데이터 파일
filename = "./csv_file/melon_chart.csv"
raw_data = pd.read_csv(filename)

In [158]:
raw_data.head()


Unnamed: 0,rank,title,singer,album,date,like
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01.01,114191
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01.01,85818
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01.01,143931
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01.01,68931
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01.01,287040


In [159]:
data = raw_data
data[['year','month','day','week']] = data['date'].str.split('.',expand=True)
data['year'] = data['year'].astype('int')
data['month'] = data['month'].astype('int')
data['day'] = data['day'].astype('int')
data['week'] = data['week'].astype('int')
data['season'] = data['month'].apply(lambda x : 1 if x in [1,2,11,12] else 0) # 1 : 겨울 / 0 : 봄 
data['date'] = data['date'].apply(lambda x : x.rsplit('.', 1)[0])

data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01,114191,2024,1,1,1,1
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01,85818,2024,1,1,1,1
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01,143931,2024,1,1,1,1
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01,68931,2024,1,1,1,1
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01,287040,2024,1,1,1,1


In [154]:
# 요구사항에 따른 1,2 월은 year-1년 으로 처리.
def adjust_year(df):
    if df['month'] in [1,2]:
        return df['year']-1
    elif df['month'] == 12 :
        return df['year']
    else :
        return df['year']

data['after_year'] = data.apply(adjust_year,axis=1)

In [160]:
# 피쳐 추가한 데이터프레임 
data.to_csv("./csv_file/melon_chart_raw_data.csv",index=False)

# 0. 동일년도 봄에도 등장하고, 겨울에도 등장하는 노래 조회.

In [103]:
spring_songs = data[data['month'].isin([3,4,5])]
winter_songs = data[data['month'].isin([1,12])] # 요구사항에 따른 2월 제외.

In [104]:
print(spring_songs.shape,winter_songs.shape)

(18174, 12) (12889, 12)


In [105]:
# 동일한 년도에 등장한 노래
# both_table = pd.merge(spring_songs,winter_songs,on=['title','year'])
both_table = pd.merge(spring_songs,winter_songs,on=['title','after_year'])

In [106]:
both_table

Unnamed: 0,rank_x,title,singer_x,album_x,date_x,like_x,year_x,month_x,day_x,week_x,...,rank_y,singer_y,album_y,date_y,like_y,year_y,month_y,day_y,week_y,season_y
0,1,Ditto,NewJeans,NewJeans 'OMG',2023.03.06,281587,2023,3,6,2,...,17,NewJeans,NewJeans 'OMG',2024.01.01,281588,2024,1,1,1,1
1,1,Ditto,NewJeans,NewJeans 'OMG',2023.03.06,281587,2023,3,6,2,...,17,NewJeans,NewJeans 'OMG',2024.01.08,281588,2024,1,8,2,1
2,1,Ditto,NewJeans,NewJeans 'OMG',2023.03.06,281587,2023,3,6,2,...,21,NewJeans,NewJeans 'OMG',2024.01.15,281588,2024,1,15,3,1
3,1,Ditto,NewJeans,NewJeans 'OMG',2023.03.06,281587,2023,3,6,2,...,22,NewJeans,NewJeans 'OMG',2024.01.22,281588,2024,1,22,4,1
4,1,Ditto,NewJeans,NewJeans 'OMG',2023.03.06,281587,2023,3,6,2,...,25,NewJeans,NewJeans 'OMG',2024.01.29,281588,2024,1,29,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19077,66,Because Of You,거미,Loveless,2010.05.09,12457,2010,5,9,2,...,76,김보경 (NEON),Because Of You,2010.12.12,4350,2010,12,12,3,1
19078,62,Because Of You,거미,Loveless,2010.05.16,12457,2010,5,16,3,...,51,김보경 (NEON),Because Of You,2010.12.05,4350,2010,12,5,2,1
19079,62,Because Of You,거미,Loveless,2010.05.16,12457,2010,5,16,3,...,76,김보경 (NEON),Because Of You,2010.12.12,4350,2010,12,12,3,1
19080,62,Because Of You,거미,Loveless,2010.05.23,12457,2010,5,23,4,...,51,김보경 (NEON),Because Of You,2010.12.05,4350,2010,12,5,2,1


In [109]:
# 원본데이터에서 동일한 년도에 등장한 노래 제거
joined_data = pd.merge(data, both_table, left_on=['title', 'singer','after_year'], right_on=['title', 'singer_x','after_year'], how='left')
result_data = joined_data[pd.isna(joined_data['singer_x'])] # 동일한 년도에 등장하지 않았다면 singer_x 가 nan 일 것 이기 떄문에. 

In [110]:
result_data.shape

(32419, 32)

In [111]:
result_data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,...,rank_y,singer_y,album_y,date_y,like_y,year_y,month_y,day_y,week_y,season_y
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01,114191,2024,1,1,1,...,,,,,,,,,,
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01,85818,2024,1,1,1,...,,,,,,,,,,
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01,143931,2024,1,1,1,...,,,,,,,,,,
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01,68931,2024,1,1,1,...,,,,,,,,,,
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01,287040,2024,1,1,1,...,,,,,,,,,,


In [112]:
# 필요한 행만 추출 
result_data = result_data[['rank', 'title', 'singer', 'album', 'date', 'like', 'year','after_year', 'month',
       'day', 'week', 'season']]

In [113]:
result_data

Unnamed: 0,rank,title,singer,album,date,like,year,after_year,month,day,week,season
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01,114191,2024,2023,1,1,1,1
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01,85818,2024,2023,1,1,1,1
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01,143931,2024,2023,1,1,1,1
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01,68931,2024,2023,1,1,1,1
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01,287040,2024,2023,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
481964,96,그날 이후로,나윤권,김형석 With 나윤권,2010.12.26,2379,2010,2010,12,26,5,1
481965,97,I'll Be Back,2PM,Still 2:00pm,2010.12.26,9493,2010,2010,12,26,5,1
481966,98,널 지우는 일,김지수,널 지우는 일,2010.12.26,1635,2010,2010,12,26,5,1
481967,99,Thanks To,"용준형, 양요섭",My Story,2010.12.26,18397,2010,2010,12,26,5,1


In [115]:
result_data = result_data.drop('year',axis=1).rename(columns={'after_year':'year'})

In [117]:
result_data.to_csv("./csv_file/except_both_title.csv",index=False)

In [53]:
data.loc[(data.title == '벚꽃 엔딩')&(data.season == 1)]

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
17986,87,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2017.02.27.01,345228,2017,2,27,1,1
20471,75,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2016.02.29.01,345228,2016,2,29,1,1
23091,95,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2015.02.23.04,345230,2015,2,23,4,1
25789,93,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2014.02.24.04,345231,2014,2,24,4,1
28382,87,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2013.02.25.04,345231,2013,2,25,4,1


# 1. 연도-계절 별 차트 등장회수 TOP5 조회

In [118]:
data = pd.read_csv("./csv_file/except_both_title.csv")
data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01,114191,2023,1,1,1,1
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01,85818,2023,1,1,1,1
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01,143931,2023,1,1,1,1
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01,68931,2023,1,1,1,1
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01,287040,2023,1,1,1,1


In [119]:
# 연도별 계절에 따른 가장 많이 등장한 title,singer,date 
# date ???

# 결과를 저장할 빈 데이터 프레임을 생성
columns=['year', 'season','title', 'singer', 'count']
result_df = pd.DataFrame(columns=columns)

temp = []

# grouped_data = data.groupby(['year','season','title','singer']).size().reset_index(name='count')
grouped_data = data.groupby(['year','season'])

for name,group in grouped_data:
    year,season = name

    # 노래 제목과 가수로 튜플로 만들어 등장회수 세기.
    song_counter = Counter(zip(group['title'],group['singer'])) # {(노래1,가수1):2,(노래2,가수1):1, ....}
    most_song = song_counter.most_common(5) # 상위 5개 [((노래1,가수1),2).... ]

    for (title,singer),count in most_song:
        temp_df = pd.DataFrame(
            {
                'year' : [year],
                'season' : [season],
                'title' : [title],
                'singer' : [singer],
                'count' : [count]
            }
        )
        temp.append(temp_df)
result_df = pd.concat(temp,ignore_index=True)
        



In [120]:
result_df

Unnamed: 0,year,season,title,singer,count
0,2009,1,우리 사랑하게 됐어요.,"가인, 조권",9
1,2009,1,너 때문에,애프터스쿨,9
2,2009,1,Bo Peep Bo Peep,티아라,9
3,2009,1,오늘 헤어졌어요,윤하 (YOUNHA),9
4,2009,1,처음처럼 그때처럼 (Feat. 강민경 From Davichi),이승기,9
...,...,...,...,...,...
145,2024,0,밤양갱,비비 (BIBI),6
146,2024,0,첫 만남은 계획대로 되지 않아,TWS (투어스),6
147,2024,0,Love wins all,아이유,6
148,2024,0,EASY,LE SSERAFIM (르세라핌),6


# 2. 3년 이상 연속으로 등장한 노래 추출

In [21]:
data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,43,잠시라도 우리,"성시경, 나얼",잠시라도 우리,2024.01.01.01,39656,2024,1,1,1,1
1,50,Chill Kill,Red Velvet (레드벨벳),Chill Kill - The 3rd Album,2024.01.01.01,42790,2024,1,1,1,1
2,56,화이트 (White),폴킴,화이트 (White),2024.01.01.01,18814,2024,1,1,1,1
3,57,눈이 오잖아(Feat.헤이즈),이무진,눈이 오잖아(Feat.헤이즈),2024.01.01.01,135142,2024,1,1,1,1
4,60,GODS,"NewJeans, League of Legends",2023 리그 오브 레전드 월드 챔피언십 주제곡,2024.01.01.01,46175,2024,1,1,1,1


In [22]:
def is_consecutive(years_set):
    sorted_years = sorted(years_set) # 연도 오름차순 정렬
    consecutive_count = 1 # 최소 1년은 등장하므로 1로 초기화.
    max_consecutive = 1 # 연속연도 count 변수

    for i in range(len(sorted_years)-1):
        # 현재연도와 다음연도가 연속적인지 확인. 2023+1 == 2024 
        if sorted_years[i] + 1 == sorted_years[i + 1] :
            consecutive_count += 1
            max_consecutive = max(max_consecutive,consecutive_count)
        else : 
            consecutive_count = 1
    return max_consecutive >= 3 # 3년연속 등장여부

In [23]:
# 연도를 기준으로 등장 연도 조회 ( 중복제거를 위해 set 사용 )
grouped_data = data.groupby(['title','singer'])['year'].apply(set).reset_index()

In [24]:
# 3년이상 연속출현 여부 , 1 : 출현 0 : 미출현
grouped_data['more_than_3'] = grouped_data['year'].apply(lambda x : 1 if is_consecutive(x) else 0 )

In [25]:
# top100 차트인 했던 연도수 리스트컬럼 
grouped_data['year'] = grouped_data['year'].apply(lambda x :list(x))

In [26]:
result_df_2 = grouped_data.rename(columns = {'year':'years'})

In [27]:
result_df_2.loc[result_df_2.more_than_3 == 1]

Unnamed: 0,title,singer,years,more_than_3
87,All I Want for Christmas Is You,Mariah Carey,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 202...",1
464,Gone,브라운 아이드 소울,"[2010, 2011, 2012]",1
674,Last Christmas,Ariana Grande,"[2018, 2020, 2021, 2022, 2023]",1
795,Make It To Christmas,Alessia Cara,"[2021, 2022, 2023]",1
827,Must Have Love,"SG 워너비, 브라운아이드걸스","[2018, 2019, 2021, 2022, 2023, 2014]",1
1033,Santa Tell Me,Ariana Grande,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]",1
1081,Snowman,Sia,"[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]",1
1162,Text Me Merry Christmas (Feat. Kristen Bell),Straight No Chaser,"[2019, 2020, 2021, 2022, 2023]",1
1232,Underneath The Tree,Kelly Clarkson,"[2021, 2022, 2023]",1
1746,꽃송이가,버스커 버스커,"[2016, 2012, 2013, 2014, 2015]",1


# 2-1. Add : 시즌에 따라 테이블 분류 (봄테이블, 겨울테이블)

In [28]:
# 봄에 차트인 했던 노래 가운데 3년 연속 등장했던 노래 목록
spring_season = data.loc[data.season == 0 ]
spring_grouped = spring_season.groupby(['title','singer'])['year'].apply(set).reset_index()
spring_grouped['more_than_3'] = spring_grouped['year'].apply(lambda x : 1 if is_consecutive(x) else 0 )
spring_grouped['year'] = spring_grouped['year'].apply(lambda x :list(x))
spring_grouped_rs = spring_grouped.rename(columns = {'year':'years'})
spring_grouped_rs

Unnamed: 0,title,singer,years,more_than_3
0,#결별,"길구봉구, 박보람",[2018],0
1,...하고 싶다,V.One,[2010],0
2,..IS YOU,XIA (준수),[2016],0
3,0330,유키스,[2011],0
4,100 Percent,이효리,[2010],0
...,...,...,...,...
1895,흩어져,먼데이 키즈 (Monday Kiz),[2010],0
1896,희나리,최준영,[2012],0
1897,희야,하예나,[2012],0
1898,희재,이수영,[2013],0


In [29]:
spring_grouped_rs.loc[spring_grouped_rs.more_than_3 == 1 ]

Unnamed: 0,title,singer,years,more_than_3
800,꽃송이가,버스커 버스커,"[2016, 2012, 2013, 2014, 2015]",1
1190,벚꽃 엔딩,버스커 버스커,"[2018, 2019, 2020, 2021, 2022, 2023, 2024, 2012]",1
1216,봄 사랑 벚꽃 말고,"HIGH4 (하이포), 아이유","[2016, 2017, 2018, 2020, 2021, 2022, 2023, 202...",1
1231,봄이 좋냐??,10CM,"[2016, 2017, 2018]",1
1621,우연히 봄,"로꼬, 유주 (YUJU)","[2016, 2020, 2022, 2023, 2024, 2015]",1


In [30]:
# 겨울에 차트인 했던 노래 가운데 3년 연속 등장했던 노래 목록
winter_season = data.loc[data.season == 1 ]
winter_grouped = winter_season.groupby(['title','singer'])['year'].apply(set).reset_index()
winter_grouped['more_than_3'] = winter_grouped['year'].apply(lambda x : 1 if is_consecutive(x) else 0 )
winter_grouped['year'] = winter_grouped['year'].apply(lambda x :list(x))
winter_grouped_rs = winter_grouped.rename(columns = {'year':'years'})
winter_grouped_rs

Unnamed: 0,title,singer,years,more_than_3
0,0310,백예린 (Yerin Baek),[2019],0
1,"08베이식 (Feat. 염따, punchnello)",베이식 (Basick),[2021],0
2,"1,2,3,4 (원,투,쓰리,포)",이하이,[2012],0
3,11:11,태연 (TAEYEON),[2016],0
4,12:45 (Stripped),Etham,[2020],0
...,...,...,...,...
2385,히치하이킹 (Hitchhiking),SHINee (샤이니),[2013],0
2386,힐링이 필요해,로이킴,"[2012, 2013]",0
2387,힘든 건 사랑이 아니다,임창정,[2020],0
2388,"힘을 내요, 그대",더필름,[2011],0


In [31]:
winter_grouped_rs.loc[winter_grouped_rs.more_than_3 == 1 ]

Unnamed: 0,title,singer,years,more_than_3
43,All I Want for Christmas Is You,Mariah Carey,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 202...",1
243,Gone,브라운 아이드 소울,"[2010, 2011, 2012]",1
363,Last Christmas,Ariana Grande,"[2018, 2020, 2021, 2022, 2023]",1
424,Make It To Christmas,Alessia Cara,"[2021, 2022, 2023]",1
441,Must Have Love,"SG 워너비, 브라운아이드걸스","[2018, 2019, 2021, 2022, 2023, 2014]",1
557,Santa Tell Me,Ariana Grande,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]",1
580,Snowman,Sia,"[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]",1
627,Text Me Merry Christmas (Feat. Kristen Bell),Straight No Chaser,"[2019, 2020, 2021, 2022, 2023]",1
664,Underneath The Tree,Kelly Clarkson,"[2021, 2022, 2023]",1
1419,미리 메리 크리스마스 (Feat. 천둥 Of MBLAQ),아이유,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 202...",1


In [32]:
spring_grouped_rs.loc[spring_grouped_rs.more_than_3 == 1 ].to_csv("./csv_file/spring_more_than3.csv",index=False)
winter_grouped_rs.loc[winter_grouped_rs.more_than_3 == 1 ].to_csv("./csv_file/winter_more_than3.csv",index=False)



# 3. 각 노래별 최초 등장일 조회

In [76]:
data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,43,잠시라도 우리,"성시경, 나얼",잠시라도 우리,2024.01.01.01,39656,2024,1,1,1,1
1,50,Chill Kill,Red Velvet (레드벨벳),Chill Kill - The 3rd Album,2024.01.01.01,42790,2024,1,1,1,1
2,56,화이트 (White),폴킴,화이트 (White),2024.01.01.01,18814,2024,1,1,1,1
3,57,눈이 오잖아(Feat.헤이즈),이무진,눈이 오잖아(Feat.헤이즈),2024.01.01.01,135142,2024,1,1,1,1
4,60,GODS,"NewJeans, League of Legends",2023 리그 오브 레전드 월드 챔피언십 주제곡,2024.01.01.01,46175,2024,1,1,1,1


In [18]:
data = raw_data
data['start_date'] = data['date'].apply(lambda x : x.split('~')[0].strip())
data['start_date'] = pd.to_datetime(data['start_date'])

result_df_3 = data.groupby(['title','singer'])['start_date'].min().reset_index()

In [19]:
result_df_3

Unnamed: 0,title,singer,start_date
0,#결별,"길구봉구, 박보람",2018-04-16
1,#첫사랑,볼빨간사춘기,2018-01-08
2,%% (응응),Apink (에이핑크),2019-01-07
3,...하고 싶다,V.One,2010-03-07
4,..IS YOU,XIA (준수),2016-05-16
...,...,...,...
4961,히히하헤호,"마마무 (Mamamoo), 긱스 (Geeks)",2014-05-26
4962,힐링이 필요해,로이킴,2012-12-03
4963,힘든 건 사랑이 아니다,임창정,2020-11-30
4964,"힘을 내요, 그대",더필름,2011-01-02


# 4. 3년 이상 등장한 음원 목록 조회.

In [33]:
data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,43,잠시라도 우리,"성시경, 나얼",잠시라도 우리,2024.01.01.01,39656,2024,1,1,1,1
1,50,Chill Kill,Red Velvet (레드벨벳),Chill Kill - The 3rd Album,2024.01.01.01,42790,2024,1,1,1,1
2,56,화이트 (White),폴킴,화이트 (White),2024.01.01.01,18814,2024,1,1,1,1
3,57,눈이 오잖아(Feat.헤이즈),이무진,눈이 오잖아(Feat.헤이즈),2024.01.01.01,135142,2024,1,1,1,1
4,60,GODS,"NewJeans, League of Legends",2023 리그 오브 레전드 월드 챔피언십 주제곡,2024.01.01.01,46175,2024,1,1,1,1


In [21]:
# 중복 연도수 제거
grouped_data = data.groupby(['title','singer'])['year'].apply(set).reset_index()
grouped_data['year'] = grouped_data['year'].apply(lambda x : list(x))
grouped_data.head()

Unnamed: 0,title,singer,year
0,#결별,"길구봉구, 박보람",[2018]
1,#첫사랑,볼빨간사춘기,[2018]
2,%% (응응),Apink (에이핑크),[2019]
3,...하고 싶다,V.One,[2010]
4,..IS YOU,XIA (준수),[2016]


In [28]:
# 등장년도 회수 칼럼
grouped_data['chartin_counts'] = grouped_data['year'].apply(lambda x : len(x))

# 등장년도 3이상 음원 조회
result_df_4 = grouped_data[grouped_data['chartin_counts'] >= 3]



Unnamed: 0,title,singer,years,chartin_counts
38,2002,Anne-Marie,"[2019, 2020, 2021]",3
69,A,Ariana Grande,"[2018, 2014, 2015]",3
79,A,Maroon 5,"[2016, 2017, 2018, 2011, 2012, 2014, 2015]",7
108,A,이해리 (다비치),"[2012, 2013, 2014]",3
115,A bientot,임영웅,"[2024, 2022, 2023]",3
...,...,...,...,...
4696,크리스마스니까,"성시경, 박효신, 이석훈, 서인국, VIXX (빅스)","[2016, 2017, 2018, 2019, 2020, 2021, 2022, 202...",13
4753,피 땀 눈물,방탄소년단,"[2016, 2017, 2018]",3
4820,한숨,이하이,"[2016, 2017, 2018]",3
4838,해요 (2022),#안녕,"[2024, 2022, 2023]",3


# 4-1 봄/겨울 구분 3번 이상 등장한 데이터 조회.

In [121]:
data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01,114191,2023,1,1,1,1
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01,85818,2023,1,1,1,1
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01,143931,2023,1,1,1,1
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01,68931,2023,1,1,1,1
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01,287040,2023,1,1,1,1


In [34]:
spring_season = data.loc[data.season == 0 ]

In [40]:
spring_season = data.loc[data.season == 0 ]
spring_grouped = spring_season.groupby(['title','singer'])['year'].apply(set).reset_index()
spring_grouped['years'] = spring_grouped['year'].apply(lambda x : list(x))
spring_grouped['chartin_counts'] = spring_grouped['year'].apply(lambda x : len(x))
spring_grouped = spring_grouped[spring_grouped['chartin_counts'] >= 3]
spring_grouped[['title','singer','years','chartin_counts']].to_csv("./csv_file/spring_more3.csv",index=False)

In [41]:
winter_season = data.loc[data.season == 1 ]
winter_grouped = winter_season.groupby(['title','singer'])['year'].apply(set).reset_index()
winter_grouped['years'] = winter_grouped['year'].apply(lambda x : list(x))
winter_grouped['chartin_counts'] = winter_grouped['year'].apply(lambda x : len(x))
winter_grouped = winter_grouped[winter_grouped['chartin_counts'] >= 3]
winter_grouped[['title','singer','years','chartin_counts']].to_csv("./csv_file/winter_more3.csv",index=False)

# 4-2. 겨울처리 후 데이터 조회
> 겨울은 다음과 같이 정의하였음.<br>  23년 겨울 = [23년12월, 24년1월, 24년2월] <br> 22년 겨울 = [22년12월, 23년1월, 23년2월]



In [6]:
data.head()

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,43,잠시라도 우리,"성시경, 나얼",잠시라도 우리,2024.01.01.01,39656,2024,1,1,1,1
1,50,Chill Kill,Red Velvet (레드벨벳),Chill Kill - The 3rd Album,2024.01.01.01,42790,2024,1,1,1,1
2,56,화이트 (White),폴킴,화이트 (White),2024.01.01.01,18814,2024,1,1,1,1
3,57,눈이 오잖아(Feat.헤이즈),이무진,눈이 오잖아(Feat.헤이즈),2024.01.01.01,135142,2024,1,1,1,1
4,60,GODS,"NewJeans, League of Legends",2023 리그 오브 레전드 월드 챔피언십 주제곡,2024.01.01.01,46175,2024,1,1,1,1


In [7]:
#! y년 1월 또는 2월인경우 
# y년 = y-1 로 처리하였음.
def adjust_year(df):
    if df['month'] in [1,2]:
        return df['year']-1
    elif df['month'] == 12 :
        return df['year']
    else :
        return df['year']

data['after_year'] = data.apply(adjust_year,axis=1)

In [125]:
winter_season = data.loc[(data.season == 1)]
# winter_grouped = winter_season.groupby(['title','singer']).agg({
#     'after_year' : lambda x : set(x),
#     'year' : lambda x : set(x)
# }).reset_index()
winter_grouped = winter_season.groupby(['title','singer']).agg({
    'year' : lambda x : set(x)
}).reset_index()
# winter_grouped['after_year'] = winter_grouped['after_year'].apply(lambda x: sorted(list(x)))
winter_grouped['year'] = winter_grouped['year'].apply(lambda x: sorted(list(x)))
# winter_grouped['chartin_counts'] = winter_grouped['after_year'].apply(lambda x: len(x))
winter_grouped['chartin_counts'] = winter_grouped['year'].apply(lambda x: len(x))
winter_grouped = winter_grouped[winter_grouped['chartin_counts'] >= 3]

In [132]:
data.loc[(data.title=='벚꽃 엔딩')&(data.season==1)]

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
13322,87,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2017.02.27,345228,2016,2,27,1,1
15516,75,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2016.02.29,345228,2015,2,29,1,1
17928,95,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2015.02.23,345230,2014,2,23,4,1
20519,93,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2014.02.24,345231,2013,2,24,4,1
23041,87,벚꽃 엔딩,버스커 버스커,버스커 버스커 1집,2013.02.25,345231,2012,2,25,4,1


In [126]:
winter_grouped

Unnamed: 0,title,singer,year,chartin_counts
55,All I Want for Christmas Is You,Mariah Carey,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...",12
461,Last Christmas,Ariana Grande,"[2018, 2020, 2021, 2022, 2023]",5
462,Last Christmas,WHAM!,"[2016, 2022, 2023]",3
536,Make It To Christmas,Alessia Cara,"[2021, 2022, 2023]",3
558,Must Have Love,"SG 워너비, 브라운아이드걸스","[2014, 2018, 2019, 2021, 2022, 2023]",6
704,Santa Tell Me,Ariana Grande,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]",8
734,Snowman,Sia,"[2017, 2018, 2019, 2020, 2021, 2022, 2023]",7
800,Text Me Merry Christmas (Feat. Kristen Bell),Straight No Chaser,"[2019, 2020, 2021, 2022, 2023]",5
845,Underneath The Tree,Kelly Clarkson,"[2021, 2022, 2023]",3
1792,미리 메리 크리스마스 (Feat. 천둥 Of MBLAQ),아이유,"[2010, 2011, 2012, 2013, 2014, 2015, 2016, 201...",14


In [52]:
winter_grouped[winter_grouped['after_year'].apply(len) != winter_grouped['year'].apply(len)]

Unnamed: 0,title,singer,after_year,year,chartin_counts
43,All I Want for Christmas Is You,Mariah Carey,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...","[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...",12
580,Snowman,Sia,"[2017, 2018, 2019, 2020, 2021, 2022, 2023]","[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]",7
2233,크리스마스니까,"성시경, 박효신, 이석훈, 서인국, VIXX (빅스)","[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...","[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...",12


In [127]:
spring_season = data.loc[data.season == 0]
# spring_grouped = spring_season.groupby(['title','singer']).agg({
#     'after_year' : lambda x : set(x),
#     'year' : lambda x : set(x)
# }).reset_index()
spring_grouped = spring_season.groupby(['title','singer']).agg({
    'year' : lambda x : set(x)
}).reset_index()
# spring_grouped['after_year'] = spring_grouped['after_year'].apply(lambda x: sorted(list(x)))
spring_grouped['year'] = spring_grouped['year'].apply(lambda x: sorted(list(x)))
# spring_grouped['chartin_counts'] = spring_grouped['after_year'].apply(lambda x: len(x))
spring_grouped['chartin_counts'] = spring_grouped['year'].apply(lambda x: len(x))
spring_grouped = spring_grouped[spring_grouped['chartin_counts'] >= 3]

In [128]:
spring_grouped

Unnamed: 0,title,singer,year,chartin_counts
1305,꽃송이가,버스커 버스커,"[2012, 2013, 2014, 2015, 2016]",5
1344,"나만, 봄",볼빨간사춘기,"[2019, 2020, 2023]",3
1942,벚꽃 엔딩,버스커 버스커,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...",13
1983,봄 사랑 벚꽃 말고,"HIGH4 (하이포), 아이유","[2014, 2015, 2016, 2017, 2018, 2020, 2021, 202...",10
2001,봄이 좋냐??,10CM,"[2016, 2017, 2018]",3
2512,여수 밤바다,버스커 버스커,"[2012, 2013, 2016]",3
2614,우연히 봄,"로꼬, 유주 (YUJU)","[2015, 2016, 2020, 2022, 2023, 2024]",6


In [53]:
spring_grouped[spring_grouped['after_year'].apply(len) != spring_grouped['year'].apply(len)]

Unnamed: 0,title,singer,after_year,year,chartin_counts


In [130]:
# winter_grouped[['title','singer','after_year','chartin_counts']].rename(columns={'after_year':'years'}).to_csv("./csv_file/winter_mt3_final.csv",index=False)
# spring_grouped[['title','singer','after_year','chartin_counts']].rename(columns={'after_year':'years'}).to_csv("./csv_file/spring_mt3_final.csv",index=False)
winter_grouped[['title','singer','year','chartin_counts']].rename(columns={'year':'years'}).to_csv("./csv_file/winter_mt3_final.csv",index=False)
spring_grouped[['title','singer','year','chartin_counts']].rename(columns={'year':'years'}).to_csv("./csv_file/spring_mt3_final.csv",index=False)

In [134]:
data

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,1,Perfect Night,LE SSERAFIM (르세라핌),Perfect Night,2024.01.01,114191,2023,1,1,1,1
1,2,Drama,aespa,Drama - The 4th Mini Album,2024.01.01,85818,2023,1,1,1,1
2,3,To. X,태연 (TAEYEON),To. X - The 5th Mini Album,2024.01.01,143931,2023,1,1,1,1
3,4,비의 랩소디,임재현,비의 랩소디,2024.01.01,68931,2023,1,1,1,1
4,5,첫 눈,EXO,겨울 스페셜 앨범 '12월의 기적 (Miracles In December)',2024.01.01,287040,2023,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
32414,96,그날 이후로,나윤권,김형석 With 나윤권,2010.12.26,2379,2010,12,26,5,1
32415,97,I'll Be Back,2PM,Still 2:00pm,2010.12.26,9493,2010,12,26,5,1
32416,98,널 지우는 일,김지수,널 지우는 일,2010.12.26,1635,2010,12,26,5,1
32417,99,Thanks To,"용준형, 양요섭",My Story,2010.12.26,18397,2010,12,26,5,1


In [133]:
winter_grouped

Unnamed: 0,title,singer,year,chartin_counts
55,All I Want for Christmas Is You,Mariah Carey,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...",12
461,Last Christmas,Ariana Grande,"[2018, 2020, 2021, 2022, 2023]",5
462,Last Christmas,WHAM!,"[2016, 2022, 2023]",3
536,Make It To Christmas,Alessia Cara,"[2021, 2022, 2023]",3
558,Must Have Love,"SG 워너비, 브라운아이드걸스","[2014, 2018, 2019, 2021, 2022, 2023]",6
704,Santa Tell Me,Ariana Grande,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]",8
734,Snowman,Sia,"[2017, 2018, 2019, 2020, 2021, 2022, 2023]",7
800,Text Me Merry Christmas (Feat. Kristen Bell),Straight No Chaser,"[2019, 2020, 2021, 2022, 2023]",5
845,Underneath The Tree,Kelly Clarkson,"[2021, 2022, 2023]",3
1792,미리 메리 크리스마스 (Feat. 천둥 Of MBLAQ),아이유,"[2010, 2011, 2012, 2013, 2014, 2015, 2016, 201...",14


In [138]:
spring_filtered_df = pd.DataFrame()
for index,row in spring_grouped.iterrows():
    match_df = data[(data['title']==row['title'])&(data['singer']==row['singer'])]
    spring_filtered_df = pd.concat([spring_filtered_df,match_df],ignore_index=True)
spring_filtered_df

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,76,꽃송이가,버스커 버스커,버스커 버스커 1집,2016.03.21,102898,2016,3,21,4,0
1,55,꽃송이가,버스커 버스커,버스커 버스커 1집,2016.03.28,102898,2016,3,28,5,0
2,59,꽃송이가,버스커 버스커,버스커 버스커 1집,2016.04.04,102898,2016,4,4,1,0
3,79,꽃송이가,버스커 버스커,버스커 버스커 1집,2016.04.11,102898,2016,4,11,2,0
4,94,꽃송이가,버스커 버스커,버스커 버스커 1집,2015.03.16,102898,2015,3,16,3,0
...,...,...,...,...,...,...,...,...,...,...,...
276,11,우연히 봄,"로꼬, 유주 (YUJU)",냄새를 보는 소녀 OST Part.2,2015.04.27,177245,2015,4,27,5,0
277,7,우연히 봄,"로꼬, 유주 (YUJU)",냄새를 보는 소녀 OST Part.2,2015.05.04,177245,2015,5,4,1,0
278,4,우연히 봄,"로꼬, 유주 (YUJU)",냄새를 보는 소녀 OST Part.2,2015.05.11,177245,2015,5,11,2,0
279,6,우연히 봄,"로꼬, 유주 (YUJU)",냄새를 보는 소녀 OST Part.2,2015.05.18,177245,2015,5,18,3,0


In [136]:
winter_filtered_df = pd.DataFrame()
for index,row in winter_grouped.iterrows():
    match_df = data[(data['title']==row['title'])&(data['singer']==row['singer'])]
    winter_filtered_df = pd.concat([winter_filtered_df,match_df],ignore_index=True)
winter_filtered_df

Unnamed: 0,rank,title,singer,album,date,like,year,month,day,week,season
0,94,All I Want for Christmas Is You,Mariah Carey,Merry Christmas (Deluxe Anniversary Edition),2024.01.01,236566,2023,1,1,1,1
1,93,All I Want for Christmas Is You,Mariah Carey,Merry Christmas (Deluxe Anniversary Edition),2023.01.02,236566,2022,1,2,1,1
2,24,All I Want for Christmas Is You,Mariah Carey,Merry Christmas (Deluxe Anniversary Edition),2023.12.04,236566,2023,12,4,1,1
3,14,All I Want for Christmas Is You,Mariah Carey,Merry Christmas (Deluxe Anniversary Edition),2023.12.11,236566,2023,12,11,2,1
4,5,All I Want for Christmas Is You,Mariah Carey,Merry Christmas (Deluxe Anniversary Edition),2023.12.18,236566,2023,12,18,3,1
...,...,...,...,...,...,...,...,...,...,...,...
478,27,크리스마스니까,"성시경, 박효신, 이석훈, 서인국, VIXX (빅스)",Jelly Christmas 2012 HEART PROJECT,2013.12.23,171261,2013,12,23,4,1
479,10,크리스마스니까,"성시경, 박효신, 이석훈, 서인국, VIXX (빅스)",Jelly Christmas 2012 HEART PROJECT,2012.12.03,171261,2012,12,3,1,1
480,2,크리스마스니까,"성시경, 박효신, 이석훈, 서인국, VIXX (빅스)",Jelly Christmas 2012 HEART PROJECT,2012.12.10,171261,2012,12,10,2,1
481,3,크리스마스니까,"성시경, 박효신, 이석훈, 서인국, VIXX (빅스)",Jelly Christmas 2012 HEART PROJECT,2012.12.17,171261,2012,12,17,3,1


In [148]:
pd.concat([spring_filtered_df,winter_filtered_df],ignore_index=True,axis=0).sort_values('date').to_csv("./csv_file/chart_in_mt3_all.csv",index=False)