In [1]:
# -*- coding: utf-8 -*-
import datetime

# Timestamp taken once, when this cell is executed.
now = datetime.datetime.now()

# Nothing here needs editing.
# Browser-like HTTP headers that are sent along with every page request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

# Only the input and output matter to a reader of this notebook.
# Input:  page_number - the page index that ends the listing URL.
# Output: the page content parsed into a machine-friendly tree.

# Helper used by get_news_data_with_date below.
def get_url_info(page_number):
    """Download one page of the investing.com Bitcoin news listing and parse it.

    Args:
        page_number: 1 fetches the listing root; any other value is appended
            to the listing URL as a path segment.

    Returns:
        A BeautifulSoup tree built with the 'lxml' parser.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    url = "https://www.investing.com/crypto/bitcoin/news"
    if page_number != 1:
        url = f"{url}/{page_number}"
    # requests has no default timeout; without one a stalled connection hangs the crawl.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on error pages instead of parsing them as if they were news listings.
    response.raise_for_status()
    # from_encoding only applies to bytes; with already-decoded text BeautifulSoup
    # ignores it (and warns), so hand over the raw body.
    return BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')

import requests
from bs4 import BeautifulSoup
# Only the inputs and output matter to a reader of this notebook.
# Inputs:
# 1. page_number: number of the listing page that contains the last date to collect.
# 2. first_date: the day [ before ] the oldest date to include, written like 'Mar 14, 2022'.
# 3. exculde_date: the day [ right after the most recent date to include ], written like 'Jun 16, 2022',
#    so that unwanted dates on the first crawled page are left out.
# Output:
# a dictionary shaped as  news date : [news title 1, news title 2, news title 3, ... , news title n]

# Month abbreviations in calendar order.
month = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
# Abbreviation -> zero-padded number, e.g. 'Dec' -> '12', 'Jan' -> '01'
month_dic = dict(zip(month, (f"{number:02d}" for number in range(1, 13))))
# Zero-padded number -> abbreviation, e.g. '12' -> 'Dec', '01' -> 'Jan'
month_dic_reverse = {number: name for name, number in month_dic.items()}

def get_news_data_with_date(page_number:int,first_date:str,exculde_date:str):
    """Collect news titles from the listing pages, grouped by their date label.

    Pages are fetched with get_url_info, starting at ``page_number`` and
    counting upwards, until a page shows an entry dated ``first_date`` or a
    page contains no date elements at all.

    Args:
        page_number: first listing page to fetch.
        first_date: date written like 'Mar 14, 2022'. Entries carrying exactly
            this date are not collected, and the crawl stops after the page
            on which one appears.
        exculde_date: date written like 'Jun 16, 2022'. Entries carrying
            exactly this date are not collected.

    Returns:
        dict mapping each date label (the element text minus its first three
        characters) to the list of titles paired with it, in page order.

    Raises:
        ValueError: if either date is not 12 characters long in the
            'Mon DD, YYYY' layout with a known month abbreviation.
    """
    date_error = '\nERROR: Date is not in proper form.\n'
    for boundary in (first_date, exculde_date):
        well_formed = (
            len(boundary) == 12
            and boundary[:3] in month
            and boundary[3] == ' '
            and boundary[6:8] == ', '
        )
        if not well_formed:
            # Raising the bare message string is itself a TypeError in Python 3,
            # so wrap it in a real exception type.
            raise ValueError(date_error)

    news_date_title_dictionary = {}

    for page_num in range(page_number, 9999999999):
        # Light progress output: first page and every 50th page.
        if page_num % 50 == 0 or page_num == 1:
            print(page_num)
        soup = get_url_info(page_num)

        # Only elements that actually carry a title attribute contribute a headline.
        news_title_list = [
            title_string
            for title_string in (tag.get('title') for tag in soup.select(".title"))
            if title_string is not None
        ]
        # The first three characters of each date text are dropped, as before.
        # NOTE(review): presumably a separator prefix in the page markup - confirm.
        page_dates = [str(tag.get_text())[3:] for tag in soup.select(".date")]

        if not page_dates:
            # Without any date there is nothing to pair and first_date can never
            # be found here; stop rather than walk on towards the range limit.
            print(f"No dated entries on page {page_num}; stopping.")
            break

        # Pair dates with titles by position BEFORE filtering. Filtering the
        # dates first shifted every later title onto the wrong date.
        for date, title in zip(page_dates, news_title_list):
            if date == exculde_date or date == first_date:
                continue
            news_date_title_dictionary.setdefault(date, []).append(title)

        # Reaching first_date marks the last page to process.
        if first_date in page_dates:
            break
    return news_date_title_dictionary

# Input: investing.com date format -> output: Upbit date format
# Input: Upbit date format -> output: investing.com date format
def convert_date_format(string:str):
    """Convert a date string between the investing.com and Upbit layouts.

    Args:
        string: one of
            * a relative label such as '1 hour ago', '5 hours ago' or
              '12 minutes ago' (resolved against the module-level ``now``),
            * an investing.com date such as 'Jul 21, 2022',
            * an Upbit timestamp such as '2022-07-21 09:00:00'.

    Returns:
        'YYYY-MM-DD 09:00:00' for the first two kinds of input,
        'Mon DD, YYYY' for an Upbit timestamp, and the input unchanged when it
        is empty or starts with neither a letter nor a digit.

    Raises:
        KeyError: if the month abbreviation / month number is not in
            ``month_dic`` / ``month_dic_reverse``.
    """
    import re  # function-scope import so the notebook's import cells stay as they are

    if not string:
        return string

    # investing.com relative labels -> Upbit format.
    relative = re.search(r'(\d+|an?)\s+(minute|hour)s?\s+ago', string)
    if relative is not None:
        quantity, unit = relative.groups()
        amount = int(quantity) if quantity.isdigit() else 1
        if unit == 'minute':
            elapsed = datetime.timedelta(minutes=amount)
        else:
            elapsed = datetime.timedelta(hours=amount)
        return (now - elapsed).strftime("%Y-%m-%d") + " 09:00:00"

    # investing.com absolute date ('Mon DD, YYYY') -> Upbit format.
    # isalpha must be *called*; the bare method object is always truthy, which
    # made the digit branch below unreachable.
    if string[0].isalpha():
        return string[8:]+"-"+month_dic[string[:3]]+"-"+string[4:6]+" 09:00:00"

    # Upbit timestamp ('YYYY-MM-DD ...') -> investing.com format.
    if string[0].isdigit():
        return month_dic_reverse[string[5:7]]+" "+string[8:10]+", "+string[:4]
    return string

In [2]:
# You can skip over this part.
# It sets up the tools used for sentiment analysis.
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Tokenizer and classification model are loaded from the same pretrained checkpoint.
_SENTIMENT_CHECKPOINT = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_CHECKPOINT)

# Only the input and output matter to a reader of this notebook.
# Input:  string - the sentence to run sentiment analysis on.
# Output: sentiment score 1~5 -> the larger the value, the more positive the sentence.
def get_sentiment(string:str):
    """Score one sentence with the module-level ``tokenizer`` and ``model``.

    Args:
        string: text to classify.

    Returns:
        int: index of the highest logit plus one (1-5 per the note above).
    """
    # truncation=True caps the input at the tokenizer's configured maximum length.
    # NOTE(review): assumes that maximum matches what the model accepts - confirm.
    tokens = tokenizer.encode(string, return_tensors='pt', truncation=True)
    # Pure inference: skip autograd bookkeeping so no graph is kept per headline.
    with torch.no_grad():
        result = model(tokens)
    # argmax is zero-based, the reported score is one-based.
    return int(torch.argmax(result.logits)) + 1

In [3]:
# Empty result table: one (initially empty) column per field collected for each headline.
news_sentiment_analysis = pd.DataFrame({
    column_name: []
    for column_name in (
        'Date', 'News title', 'Open price', 'High price',
        'Low price', 'Close price', 'Volume', 'Sentiment',
    )
})
news_sentiment_analysis

Unnamed: 0,Date,News title,Open price,High price,Low price,Close price,Volume,Sentiment


In [4]:
# Number of the listing page that contains the last date to collect;
# with 1 the collection starts from the most recent news titles.
page_number = 1
# The two boundary dates of the collection window.
the_day_before_first_date = 'Dec 31, 2017'
the_day_after_latest_date = 'May 01, 2022'
news_date_title_dictionary = get_news_data_with_date(
    page_number=page_number,
    first_date=the_day_before_first_date,
    exculde_date=the_day_after_latest_date,
)

1




50
100


In [10]:
import pyupbit
# Request parameters for the Upbit OHLCV query.
ticker, interval = 'KRW-BTC', 'day'
to, count = 'Jul 01, 2023', 2000
upbit_info = pyupbit.get_ohlcv(
    ticker=ticker,
    interval=interval,
    to=to,
    count=count,
)
display(upbit_info)

Unnamed: 0,open,high,low,close,volume,value
2017-09-25 09:00:00,4201000.0,4333000.0,4175000.0,4322000.0,132.484755,5.602146e+08
2017-09-26 09:00:00,4317000.0,4418000.0,4311000.0,4321000.0,22.788340,9.950724e+07
2017-09-27 09:00:00,4322000.0,4677000.0,4318000.0,4657000.0,32.269662,1.448276e+08
2017-09-28 09:00:00,4657000.0,4772000.0,4519000.0,4586000.0,80.588243,3.721860e+08
2017-09-29 09:00:00,4586000.0,4709000.0,4476000.0,4657000.0,59.352373,2.724558e+08
...,...,...,...,...,...,...
2022-07-17 09:00:00,27856000.0,28481000.0,27453000.0,27505000.0,6347.366046,1.777696e+11
2022-07-18 09:00:00,27505000.0,29613000.0,27450000.0,29322000.0,13859.745278,3.983009e+11
2022-07-19 09:00:00,29322000.0,31050000.0,28476000.0,30660000.0,14595.947856,4.322420e+11
2022-07-20 09:00:00,30646000.0,31831000.0,30128000.0,30505000.0,12434.443757,3.852729e+11


In [11]:
# Build one row per headline: converted date, title, that day's prices, sentiment score.
# Rows are gathered in a plain list and turned into a frame once; growing a
# DataFrame through .loc one row at a time re-allocates it on every insert.
sentiment_rows = []
for date, news_titles in news_date_title_dictionary.items():
    date_converted = convert_date_format(date)
    try:
        price_data = upbit_info.loc[date_converted]
    except KeyError:
        # No price row for this day: report it and carry on instead of
        # discarding all the rows scored so far.
        print(f"No price data for {date_converted}; skipping {len(news_titles)} headline(s).")
        continue
    for news_title in news_titles:
        sentiment_rows.append([
            date_converted,
            news_title,
            price_data['open'],
            price_data['high'],
            price_data['low'],
            price_data['close'],
            price_data['volume'],
            get_sentiment(news_title),
        ])

# Reuse the column layout of the existing (empty) result table.
news_sentiment_analysis = pd.DataFrame(sentiment_rows, columns=list(news_sentiment_analysis.columns))
# Kept for later cells: the counter ends at the number of rows, as before.
cnt = len(sentiment_rows)
display(news_sentiment_analysis)

2022-07-21 09:00:00


Unnamed: 0,Date,News title,Open price,High price,Low price,Close price,Volume,Sentiment
0,2022-07-21 09:00:00,Microsoft Teams back up for most users after g...,30500000.0,30849000.0,29710000.0,29912000.0,6609.956700,1
1,2022-07-21 09:00:00,"Stocks, euro steady ahead of ECB, crude tumbles",30500000.0,30849000.0,29710000.0,29912000.0,6609.956700,5
2,2022-07-21 09:00:00,"Russian gas flows to Germany again, Moscow eye...",30500000.0,30849000.0,29710000.0,29912000.0,6609.956700,1
3,2022-07-21 09:00:00,"Chart Of The Day: S&P 500 To 3,200",30500000.0,30849000.0,29710000.0,29912000.0,6609.956700,5
4,2022-07-21 09:00:00,Chart Of The Day: Apple To $125,30500000.0,30849000.0,29710000.0,29912000.0,6609.956700,5
...,...,...,...,...,...,...,...,...
1921,2022-04-26 09:00:00,The Central African Republic reportedly passes...,50614000.0,51193000.0,48300000.0,48777000.0,4171.124098,1
1922,2022-04-26 09:00:00,Microsoft Teams back up for most users after g...,50614000.0,51193000.0,48300000.0,48777000.0,4171.124098,1
1923,2022-04-26 09:00:00,"Stocks, euro steady ahead of ECB, crude tumbles",50614000.0,51193000.0,48300000.0,48777000.0,4171.124098,5
1924,2022-04-26 09:00:00,"Russian gas flows to Germany again, Moscow eye...",50614000.0,51193000.0,48300000.0,48777000.0,4171.124098,1


In [14]:
# Save the per-headline table; the utf-8-sig codec writes a UTF-8 byte-order mark first.
news_sentiment_analysis.to_csv(
    'Bitcoin_News_Title_Upbit_Price_Sentiment.csv',
    encoding="utf-8-sig",
)

# 날짜-감정 데이터셋 만들기

In [26]:
# Initialise the variables used to build the date -> sentiment dataset.

date_sentiment = {}
get_sentiment_sum = 0
# Number of rows accumulated so far. Nothing has been counted yet, so this
# starts at 0; starting at 1 inflated the divisor of the first day's average.
length_of_sentiment = 0
cnt = 0
# Date of the first row; None when the table is empty (a label lookup would raise).
before_news_date = news_sentiment_analysis.loc[cnt, 'Date'] if len(news_sentiment_analysis) else None

In [27]:
# Mean sentiment per news date.
# The hand-rolled running sum this replaces had four defects:
#   - it stored each average under the *next* date's key,
#   - it skipped the first row of every date,
#   - it never stored the last date,
#   - it hid division by zero behind a bare except.
# A group-by computes the intended value directly.
date_sentiment = (
    news_sentiment_analysis
    # Scores may be held in an object column; make them numeric before averaging.
    .assign(Sentiment=lambda frame: frame['Sentiment'].astype(float))
    # sort=False keeps the dates in their order of first appearance.
    .groupby('Date', sort=False)['Sentiment']
    .mean()
    .to_dict()
)
# The row counter ends at the table length, as it did after the original loop.
cnt = len(news_sentiment_analysis)

In [28]:
# Empty date -> average-sentiment table with its two columns declared up front.
df_date_sentiment = pd.DataFrame({column_name: [] for column_name in ('Date', 'Sentiment')})

In [35]:
# Turn the date -> average mapping into a two-column table in one step.
# Row labels count down to 0 in dictionary order. The start label is derived
# from the mapping size instead of a hard-coded 1626, so it stays correct
# when the number of collected dates changes.
cnt = len(date_sentiment) - 1
df_date_sentiment = pd.DataFrame(
    {
        'Date': list(date_sentiment.keys()),
        'Sentiment': list(date_sentiment.values()),
    },
    index=range(cnt, -1, -1),
)

In [36]:
# Order the rows by their 'Date' value (ascending, the sort_values default).
df_date_sentiment = df_date_sentiment.sort_values(by='Date')

In [37]:
# Save the per-date table; the utf-8-sig codec writes a UTF-8 byte-order mark first.
df_date_sentiment.to_csv(
    'Bitcoin_News_sentiment_data.csv',
    encoding="utf-8-sig",
)