### 라이브러리

In [1]:
import re
import pandas as pd

### 데이터 확인

In [2]:
data = pd.read_csv('./queen.csv')
data.head(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1568167503947235331,1568167503947235331,2022-09-09 14:50:42 India Standard Time,2022-09-09,14:50:42,530,963099618547589121,inpd_,In Professional Development,,...,,,,,,[],,,,
1,1568167501334253568,1568167501334253568,2022-09-09 14:50:42 India Standard Time,2022-09-09,14:50:42,530,26475981,ukpostbox,UK Postbox,,...,,,,,,[],,,,
2,1568167500134731776,1568167500134731776,2022-09-09 14:50:41 India Standard Time,2022-09-09,14:50:41,530,868028006610153472,brandminds,BRAND MINDS,,...,,,,,,[],,,,


In [3]:
data.columns  # 데이터 컬럼 확인

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

In [4]:
data.loc[0]  # 데이터 예시 확인

id                                               1568167503947235331
conversation_id                                  1568167503947235331
created_at                   2022-09-09 14:50:42 India Standard Time
date                                                      2022-09-09
time                                                        14:50:42
timezone                                                         530
user_id                                           963099618547589121
username                                                       inpd_
name                                     In Professional Development
place                                                            NaN
tweet              We at In Professional Development join with pe...
language                                                          en
mentions                                                          []
urls               ['https://www.inpd.co.uk/blog/a-statement-from...
photos             ['https://pbs.t

In [5]:
data.loc[:5]['tweet']  # 데이터 트윗 텍스트 확인

0    We at In Professional Development join with pe...
1    Join us in remembering Her Majesty Queen Eliza...
2    "When life seems hard, the courageous do not l...
3    We join the nation in mourning the death of He...
4    We are saddened by the death of Her Majesty Qu...
5    @Historia_Fotos Is Queen the Band but no the r...
Name: tweet, dtype: object

### 정형 데이터 전처리

In [6]:
data.info()  # 데이터 컬럼 결측치 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190325 entries, 0 to 190324
Data columns (total 36 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               190325 non-null  int64  
 1   conversation_id  190325 non-null  int64  
 2   created_at       190325 non-null  object 
 3   date             190325 non-null  object 
 4   time             190325 non-null  object 
 5   timezone         190325 non-null  int64  
 6   user_id          190325 non-null  int64  
 7   username         190325 non-null  object 
 8   name             190315 non-null  object 
 9   place            228 non-null     object 
 10  tweet            190325 non-null  object 
 11  language         190325 non-null  object 
 12  mentions         190325 non-null  object 
 13  urls             190325 non-null  object 
 14  photos           190325 non-null  object 
 15  replies_count    190325 non-null  int64  
 16  retweets_count   190325 non-null  int6

In [7]:
# 해당 컬럼 값이 모두 nan인 경우 드랍
all_nan_columns = ['near', 'geo', 'source', 'user_rt_id',
                   'user_rt', 'retweet_id', 'reply_to', 'retweet_date',
                   'translate', 'trans_src', 'trans_dest']
data = data.drop(columns=all_nan_columns)

### 트윗 텍스트 데이터 전처리

In [8]:
def preprocess_text(text):
    # () 괄호 안 제거
    text = re.sub(r'\([^)]*\)', '', text)
    # 유저명 제거
    text = re.sub(r'@[a-zA-Z0-0-_.]+', '', text)
    # 해시태그 제거
    text = re.sub(r'#[a-zA-Z0-9-.]+', '', text)
    # url 제거
    text = re.sub(r'https://t.co/[a-zA-Z0-9-.]+', '', text)
    return text.strip()

In [9]:
# 트윗 텍스트 데이터 전처리 예시
print('전처리 전')
print(data.loc[2]['tweet'])
print('\n')

print('전처리 후')
print(preprocess_text(data.loc[2]['tweet']))

전처리 전
"When life seems hard, the courageous do not lie down and accept defeat; instead, they are all the more determined to struggle for a better future."   - Queen Elizabeth II (1926-2022)   #queenelizabeth #worldchanger #leadership  https://t.co/74X7t8a3Ax


전처리 후
"When life seems hard, the courageous do not lie down and accept defeat; instead, they are all the more determined to struggle for a better future."   - Queen Elizabeth II


In [10]:
# 전체 데이터에서 트윗 텍스트 전처리
data['tweet'] = data['tweet'].apply(preprocess_text)

In [11]:
# 전처리 후 각 트윗 텍스트의 길이를 컬럼화
tweet_len = [len(data.loc[i]['tweet']) for i in range(len(data))]
data['tweet_len'] = tweet_len

In [12]:
data['tweet_len'].describe()  # 전처리 후 트윗 텍스트 길이 확인

count    190325.000000
mean        114.696409
std          77.931011
min           0.000000
25%          51.000000
50%          93.000000
75%         173.000000
max         354.000000
Name: tweet_len, dtype: float64

In [13]:
# 전처리 후 트윗 텍스트 길이가 전체 25% 이하일 경우 드랍
print(f'전처리 전 전체 데이터 개수는 {len(data)}개입니다.')

under25_idx = data[data['tweet_len'] <= 51].index
data = data.drop(under25_idx).reset_index()
data.to_csv('./queen_notshort.csv', index=False, encoding='utf-8-sig')
print(f'전처리 후 전체 데이터 개수는 {len(data)}개입니다.')

전처리 전 전체 데이터 개수는 190325개입니다.
전처리 후 전체 데이터 개수는 142348개입니다.
