# 텍스트 크리닝
컴퓨터가 자연어를 이해하고 처리하게 하는 것이 가장 어려운 작업중 하나이다. 언어는 원래 분명한 의미를 전달하지 못하는 경우가 많다. 같은 단어도 상황에 따라서 전혀 다른 의미로 해석되기도 한다. 컴퓨터는 잘 구조화된, 잘 정의된, 의미가 분명한 데이터를 처리하데는 우수하지만 텍스트나 음성을 처리하는데는 사람에 비해 능력이 매우 부족하다.

자연어를 컴퓨터가 이해하고 처리할 수 있게 바꾸는 작업은 생각보다 매우 어렵고 복잡한 작업이 되기 쉽다.

이 예에서는 텍스트 데이터를 분석하기 좋게 변경하는 단계들을 다루겠다. 앞에서 소개한 트위터 데이터 수집 프로그램에서 얻은 트위터 데이터를 사용하겠다.

In [2]:
from data.load_tweets import load_tweets
data = load_tweets(overwrite=False)
print(data['text'])

0      Better find an Apple Inc. Store right quick lo...
1      RT @TropDontTweet: People soley into death met...
2      If you follow $AAPL This is a must join site! ...
3      RT @ValaAfshar: Steve Jobs talking about what ...
4      RT @ValaAfshar: If you are doing great work, i...
5      Steve Jobs talking about what happens when mar...
6      RT @LifeNewsHQ: Steve Jobs Was Glad He Didn’t ...
7      RT @TropDontTweet: People soley into death met...
8      "Steve Jobs was a dick" I type on twitter from...
9      RT @ValaAfshar: If you are doing great work, i...
10     Watching "Steve Jobs"\n \nI think this is Matt...
11     If you follow $WNR Share your opinions here --...
12     #ZinUrban for the follow #CLOUDUNLOCK Check Ou...
13     RT @TropDontTweet: People soley into death met...
14     People who think they are crazy enough to chan...
15     RT @ivantaged: Steve Jobs takes on Pasadena. I...
16     RT @ValaAfshar: If you are doing great work, i...
17     RT @SABAHtwt: Your time 

우선, 본문에서 중복되는 부분을 모두 제거하겠다.

In [7]:
texts = set(data['text'])
print("{} -> {} tweets".format(len(data['text'].values), len(texts)))

286 -> 252 tweets


이제 URL 내용을 "URL" 객체에 담겠다. 이를 위해서 regular expression 라이브러리가 필요하다

In [5]:
# re는 파이선에서 regular expressions을 사용하기 위한 라이브러리이다

import re
# regular expressions을 다루는 작업은 다소 복잡하다
# 이를 이해하기 위해서 regular expression 확인을 온라인으로 하는 사이트가 있다
# 참고 사이트 https://regex101.com/ 
texts = [re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', 'URL', text, flags=re.MULTILINE) for text in texts]

# 링크가 있는 곳이면 "URL" 단어를 볼 수 있을 것이다
print(texts)

['Dank Kids S01E01 via @9gag URL', "RT @ShaylaRacquel: Yesterday my Aunt (70) went through my Uncle (74) ipad &amp; found a white woman's nudes. Thanksgiving dinner went UP... on…", 'Apple iPhone 5S Silver 64GB Unlocked Smartphone (Certified Refurbished) URL $AAPL #apple #iphone5s', 'RT @thesuccesstalk: Innovation distinguishes between a leader and a follower. -Steve Jobs', 'I liked a @YouTube video from @tiktakdraw URL STEVE JOBS | Draw My Life En Español', 'RT @9to5mac: VLC media player preview brings 360-degree video playback to the Mac, iOS support expected next year URL h…', 'Why did the US President not award Steve Jobs and Elon Musk the Presidential Medal of Freedom?   URL', "Check out my custom truck in #MMXRacing for iOS. Join me, it's free! URL URL", 'You can get an iPad for under $200 during Black Friday URL', 'I`m convinced about half of what separates the successful entrepreneurs from the non-successful ones is pure perseverance Steve Jobs', '【My cutie devil】\u3000 #mycuti

같은 작업을 트위터 usernames에 대해서 수행하겠다.

In [43]:
# regex taken from http://shahmirj.com/blog/extracting-twitter-usertags-using-regex
texts = [re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\\.]))@([A-Za-z]+[A-Za-z0-9_]+)', 'USER', text, flags=re.MULTILINE) for text in texts]
print(texts)

['Microsoft loves #mobile, especially #iOS and #Android: URL', 'Apple to replace dodgy iPhone 6s batteries URL', 'USER Thanks for following us! The app is now live on iOS at URL  &amp; Android at URL.', "Buyers' guide: Choosing the perfect MacBook model from Apple's numerous new offerings URL", 'Again USER new email upgrade horrible.  Thread confusing. not clear there is an attachment unless one scrolls thru thread #iphone #iOS10', '#Apple Touch Bar MacBook Pro Orders Begin Shipping to European Customers. Read more: URL $AAPL', 'USER Great tweet. Ive been watching $PSID closely. Its the hottest new #biotech stock right now \n$AAPL $SPY $WRAP $SRPT', 'Finding you discounts on Cardiff food deals! Download our IPhone App URL #tastebud', '#Apple Will Fix Troubled IPhone 6s Units. Read more: URL $AAPL', 'MacBook Pro/Air Sleeves / Cover Shop Etsy | URL | #MacBookProSleeve #MacBookProCover #MacBookProLEather #MacBookAIR', 'USER Hey Nate, first time tweeting. My son is a Type 1 and we are look

영어에서 축약 표현 (she will -> she'll 등)은 매우 자주 사용된다. 이러한 축약 표현을 원래의 표현으로 바꾸는 작업이 필요하다.

In [8]:
def collapse_contractions(text):
    # 영어의 축약표현은 다음의 사이트에서 확인할 수 있다
    # https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
    collapsable_contractions = {'she will have': "she'll've", 'shall not': "shan't", 'it would': "it'd", 'ought not have': "oughtn't've", 'where have': "where've", 'we would have': "we'd've", 'of the clock': "o'clock", 'where is': "where's", 'am not': "aren't", 'it would have': "it'd've", 'what has': "what's", 'had not': "hadn't", 'what is': "what's", 'you shall': "you'll", 'I will': "I'll", 'might not have': "mightn't've", 'why is': "why's", 'they had': "they'd", 'you have': "you've", 'you all would': "y'all'd", 'he shall': "he'll", 'should not have': "shouldn't've", 'he is': "he's", 'so have': "so've", 'would not have': "wouldn't've", 'he will': "he'll", 'she would have': "she'd've", 'you will': "you'll", 'when is': "when's", 'so as': "so's", 'it shall have': "it'll've", 'who will': "who'll", 'there would': "there'd", 'she is': "she's", 'that had': "that'd", 'must have': "must've", 'would not': "wouldn't", 'madam': "ma'am", 'they will': "they'll", 'he would have': "he'd've", 'they will have': "they'll've", 'to have': "to've", 'could not have': "couldn't've", 'she has': "she's", 'must not have': "mustn't've", 'do not': "don't", 'that would': "that'd", 'need not': "needn't", 'cannot': "can't", 'you shall have': "you'll've", 'you would': "you'd", 'who shall': "who'll", 'you all would have': "y'all'd've", 'I shall': "I'll", 'we are': "we're", 'why has': "why's", 'she shall': "she'll", 'I have': "I've", 'what will have': "what'll've", 'it will have': "it'll've", 'have not': "haven't", 'what have': "what've", 'was not': "wasn't", 'could not': "couldn't", 'why have': "why've", 'I had': "I'd", 'I shall have': "I'll've", 'is not': "isn't", 'when has': "when's", 'that is': "that's", 'should have': "should've", 'what are': "what're", 'he has': "he's", 'might not': "mightn't", 'it will': "it'll", 'you had': "you'd", 'there is': "there's", 'what shall': "what'll", 'because': "'cause", 'he had': "he'd", 'I would': "I'd", 'I would have': "I'd've", 'he shall have': "he'll've", 'you will have': "you'll've", 'may not': "mayn't", 'where has': "where's", 'what shall have': "what'll've", 'where did': "where'd", 'we will have': "we'll've", 'we have': "we've", 'must not': "mustn't", 'he will have': "he'll've", 'who have': "who've", 'there has': "there's", 'cannot have': "can't've", 'shall not have': "shan't've", 'ought not': "oughtn't", 'how do you': "how'd'y", 'how does': "how's", 'should not': "shouldn't", 'are not': "aren't", 'you all have': "y'all've", 'does not': "doesn't", 'you are': "you're", 'how has': "how's", 'will have': "will've", 'there had': "there'd", 'let us': "let's", 'she had': "she'd", 'they would have': "they'd've", 'has not': "hasn't", 'who will have': "who'll've", 'did not': "didn't", 'who is': "who's", 'we had': "we'd", 'they have': "they've", 'he would': "he'd", 'so is': "so's", 'she will': "she'll", 'how is': "how's", 'how will': "how'll", 'you all': "y'all", 'they would': "they'd", 'they shall have': "they'll've", 'I am': "I'm", 'how did': "how'd", 'it had': "it'd", 'might have': "might've", 'were not': "weren't", 'will not have': "won't've", 'you all are': "y'all're", 'she shall have': "she'll've", 'she would': "she'd", 'who shall have': "who'll've", 'could have': "could've", 'it has': "it's", 'that has': "that's", 'who has': "who's", 'we would': "we'd", 'they shall': "they'll", 'that would have': "that'd've", 'you would have': "you'd've", 'they are': "they're", 'it is': "it's", 'had not have': "hadn't've", 'when have': "when've", 'need not have': "needn't've", 'it shall': "it'll", 'we will': "we'll", 'would have': "would've", 'what will': "what'll", 'I will have': "I'll've", 'there would have': "there'd've", 'will not': "won't"}
    for key in sorted(collapsable_contractions, key=len, reverse=True): # collapse phrases into contractions in order of length of phrase so "she would have" becomes "she'd've" instead of "she'd have"
        text = text.replace(key, collapsable_contractions[key])
    return text

texts = [collapse_contractions(text) for text in texts]
print(texts)

['Dank Kids S01E01 via @9gag https://t.co/3zVAb9hYfz', "RT @ShaylaRacquel: Yesterday my Aunt (70) went through my Uncle (74) ipad &amp; found a white woman's nudes. Thanksgiving dinner went UP... on…", 'Apple iPhone 5S Silver 64GB Unlocked Smartphone (Certified Refurbished) https://t.co/JOucB6U2so $AAPL #apple #iphone5s', 'RT @thesuccesstalk: Innovation distinguishes between a leader and a follower. -Steve Jobs', 'I liked a @YouTube video from @tiktakdraw https://t.co/WprbvGAIsd STEVE JOBS | Draw My Life En Español', 'RT @9to5mac: VLC media player preview brings 360-degree video playback to the Mac, iOS support expected next year https://t.co/tewV3UI0Mv h…', 'Why did the US President not award Steve Jobs and Elon Musk the Presidential Medal of Freedom?   https://t.co/xJTP6dSYOw', "Check out my custom truck in #MMXRacing for iOS. Join me, it's free! https://t.co/JFfAL2OnwV https://t.co/4cRd7fP5g9", 'You can get an iPad for under $200 during Black Friday https://t.co/zxNn7ixSyT', 'I`m co

In [44]:
def collapse_contractions(text):
    # 영어의 축약표현은 다음의 사이트에서 확인할 수 있다
    # https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
    collapsable_contractions = {'she will have': "she'll've", 'shall not': "shan't", 'it would': "it'd", 'ought not have': "oughtn't've", 'where have': "where've", 'we would have': "we'd've", 'of the clock': "o'clock", 'where is': "where's", 'am not': "aren't", 'it would have': "it'd've", 'what has': "what's", 'had not': "hadn't", 'what is': "what's", 'you shall': "you'll", 'I will': "I'll", 'might not have': "mightn't've", 'why is': "why's", 'they had': "they'd", 'you have': "you've", 'you all would': "y'all'd", 'he shall': "he'll", 'should not have': "shouldn't've", 'he is': "he's", 'so have': "so've", 'would not have': "wouldn't've", 'he will': "he'll", 'she would have': "she'd've", 'you will': "you'll", 'when is': "when's", 'so as': "so's", 'it shall have': "it'll've", 'who will': "who'll", 'there would': "there'd", 'she is': "she's", 'that had': "that'd", 'must have': "must've", 'would not': "wouldn't", 'madam': "ma'am", 'they will': "they'll", 'he would have': "he'd've", 'they will have': "they'll've", 'to have': "to've", 'could not have': "couldn't've", 'she has': "she's", 'must not have': "mustn't've", 'do not': "don't", 'that would': "that'd", 'need not': "needn't", 'cannot': "can't", 'you shall have': "you'll've", 'you would': "you'd", 'who shall': "who'll", 'you all would have': "y'all'd've", 'I shall': "I'll", 'we are': "we're", 'why has': "why's", 'she shall': "she'll", 'I have': "I've", 'what will have': "what'll've", 'it will have': "it'll've", 'have not': "haven't", 'what have': "what've", 'was not': "wasn't", 'could not': "couldn't", 'why have': "why've", 'I had': "I'd", 'I shall have': "I'll've", 'is not': "isn't", 'when has': "when's", 'that is': "that's", 'should have': "should've", 'what are': "what're", 'he has': "he's", 'might not': "mightn't", 'it will': "it'll", 'you had': "you'd", 'there is': "there's", 'what shall': "what'll", 'because': "'cause", 'he had': "he'd", 'I would': "I'd", 'I would have': "I'd've", 'he shall have': "he'll've", 'you will have': "you'll've", 'may not': "mayn't", 'where has': "where's", 'what shall have': "what'll've", 'where did': "where'd", 'we will have': "we'll've", 'we have': "we've", 'must not': "mustn't", 'he will have': "he'll've", 'who have': "who've", 'there has': "there's", 'cannot have': "can't've", 'shall not have': "shan't've", 'ought not': "oughtn't", 'how do you': "how'd'y", 'how does': "how's", 'should not': "shouldn't", 'are not': "aren't", 'you all have': "y'all've", 'does not': "doesn't", 'you are': "you're", 'how has': "how's", 'will have': "will've", 'there had': "there'd", 'let us': "let's", 'she had': "she'd", 'they would have': "they'd've", 'has not': "hasn't", 'who will have': "who'll've", 'did not': "didn't", 'who is': "who's", 'we had': "we'd", 'they have': "they've", 'he would': "he'd", 'so is': "so's", 'she will': "she'll", 'how is': "how's", 'how will': "how'll", 'you all': "y'all", 'they would': "they'd", 'they shall have': "they'll've", 'I am': "I'm", 'how did': "how'd", 'it had': "it'd", 'might have': "might've", 'were not': "weren't", 'will not have': "won't've", 'you all are': "y'all're", 'she shall have': "she'll've", 'she would': "she'd", 'who shall have': "who'll've", 'could have': "could've", 'it has': "it's", 'that has': "that's", 'who has': "who's", 'we would': "we'd", 'they shall': "they'll", 'that would have': "that'd've", 'you would have': "you'd've", 'they are': "they're", 'it is': "it's", 'had not have': "hadn't've", 'when have': "when've", 'need not have': "needn't've", 'it shall': "it'll", 'we will': "we'll", 'would have': "would've", 'what will': "what'll", 'I will have': "I'll've", 'there would have': "there'd've", 'will not': "won't"}
    for key in sorted(collapsable_contractions, key=len, reverse=True): # collapse phrases into contractions in order of length of phrase so "she would have" becomes "she'd've" instead of "she'd have"
        text = text.replace(key, collapsable_contractions[key])
    return text

texts = [collapse_contractions(text) for text in texts]
print(texts)

['Microsoft loves #mobile, especially #iOS and #Android: URL', 'Apple to replace dodgy iPhone 6s batteries URL', 'USER Thanks for following us! The app is now live on iOS at URL  &amp; Android at URL.', "Buyers' guide: Choosing the perfect MacBook model from Apple's numerous new offerings URL", "Again USER new email upgrade horrible.  Thread confusing. not clear there's an attachment unless one scrolls thru thread #iphone #iOS10", '#Apple Touch Bar MacBook Pro Orders Begin Shipping to European Customers. Read more: URL $AAPL', 'USER Great tweet. Ive been watching $PSID closely. Its the hottest new #biotech stock right now \n$AAPL $SPY $WRAP $SRPT', 'Finding you discounts on Cardiff food deals! Download our IPhone App URL #tastebud', '#Apple Will Fix Troubled IPhone 6s Units. Read more: URL $AAPL', 'MacBook Pro/Air Sleeves / Cover Shop Etsy | URL | #MacBookProSleeve #MacBookProCover #MacBookProLEather #MacBookAIR', "USER Hey Nate, first time tweeting. My son is a Type 1 and we're lookin

마침문자들 (punctuation)을 제거하되 '#' 나 '$'는 제거하지 않는다. 이는 트위터에서 특별한 목적으로 사용되는 문자들이다.

In [45]:
import string

punctuation = string.punctuation
punctuation = punctuation.replace("#", "")
punctuation = punctuation.replace("$", "")
texts = [text.translate({ord(i):None for i in punctuation}) for text in texts]
print(texts)

['Microsoft loves #mobile especially #iOS and #Android URL', 'Apple to replace dodgy iPhone 6s batteries URL', 'USER Thanks for following us The app is now live on iOS at URL  amp Android at URL', 'Buyers guide Choosing the perfect MacBook model from Apples numerous new offerings URL', 'Again USER new email upgrade horrible  Thread confusing not clear theres an attachment unless one scrolls thru thread #iphone #iOS10', '#Apple Touch Bar MacBook Pro Orders Begin Shipping to European Customers Read more URL $AAPL', 'USER Great tweet Ive been watching $PSID closely Its the hottest new #biotech stock right now \n$AAPL $SPY $WRAP $SRPT', 'Finding you discounts on Cardiff food deals Download our IPhone App URL #tastebud', '#Apple Will Fix Troubled IPhone 6s Units Read more URL $AAPL', 'MacBook ProAir Sleeves  Cover Shop Etsy  URL  #MacBookProSleeve #MacBookProCover #MacBookProLEather #MacBookAIR', 'USER Hey Nate first time tweeting My son is a Type 1 and were looking into getting him an Appl

이제 텍스트를 단아들로 구분하고 단어들을 명료한 단어로 정리한다.

In [49]:
# 단어로 분리하고 정리한다
cleantexts = []
for text in texts:
    words = text.split(" ")
    words = [word.lower() for word in words]
    words = [word.strip() for word in words]
    words = [word for word in words if len(word) > 0]
    cleantext = " ".join(words)
    cleantexts.append(cleantext)

texts = cleantexts
print(texts)

['microsoft loves #mobile especially #ios and #android url', 'apple to replace dodgy iphone 6s batteries url', 'user thanks for following us the app is now live on ios at url amp android at url', 'buyers guide choosing the perfect macbook model from apples numerous new offerings url', 'again user new email upgrade horrible thread confusing not clear theres an attachment unless one scrolls thru thread #iphone #ios10', '#apple touch bar macbook pro orders begin shipping to european customers read more url $aapl', 'user great tweet ive been watching $psid closely its the hottest new #biotech stock right now $aapl $spy $wrap $srpt', 'finding you discounts on cardiff food deals download our iphone app url #tastebud', '#apple will fix troubled iphone 6s units read more url $aapl', 'macbook proair sleeves cover shop etsy url #macbookprosleeve #macbookprocover #macbookproleather #macbookair', 'user hey nate first time tweeting my son is a type 1 and were looking into getting him an apple watch

이제 명료하게 정리된 텍스트 (clear text)를 가지고 처리할 수 있는 작업을 수행할 수 있다. 예를 들어 인터넷에서 널리 사용되는 단축어를 원래 단어로 교체할 수 있다 ("b4"를 "before"로 교체 등). 모든 숫자들을 특정 저장소인 "NUM"에 저장해둘 수 있다. ASCII 코드가 아닌 문자들을 제거할 수도 있다. 마침문자들을 다른 기능을 하도록 변환할 수도 있다. 마침문자를 사용하여 텍스트를 단위문장이나 패러그라프로 나눌 수도 있다. 'the'와 같이 의미 없는 단어들을 모두 없앨 수 있다. 모든 단어를 변형 문자가 아닌 원형단어로 교체할 수 있다. (어미 변화를 없애는 등)

텍스트 데이터를 어떻게 처리하는지는 분석의 목적과 상황에 따라 다르게 처리할 수 있다. 일단 텍스트 파일을 명료한 문서로 바꾸는 것으로부터 텍스트 분석이 시작된다고 할 수 있다. 아래에 정리한 cleantext.py 프로그램에는 위에서 설명한 여러가지 작업들을 처리하는 기능을 포함하고 있으며 다음과 같이 필요한 함수를 호출할 수 있다.

`cleantext = clean(text)`. 

In [52]:
from re import sub as rsub
import string

def clean(text):
    # URL과 usernames을 찾는다
    cleantext = rsub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', 'URL', text, flags=re.MULTILINE)
    cleantext = rsub(r'(?<=^|(?<=[^a-zA-Z0-9-_\\.]))@([A-Za-z]+[A-Za-z0-9_]+)', 'USER', cleantext, flags=re.MULTILINE)
    
    # 영어의 축약어 처리
    # 참고 사이트 https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions
    collapsable_contractions = {'she will have': "she'll've", 'shall not': "shan't", 'it would': "it'd", 'ought not have': "oughtn't've", 'where have': "where've", 'we would have': "we'd've", 'of the clock': "o'clock", 'where is': "where's", 'am not': "aren't", 'it would have': "it'd've", 'what has': "what's", 'had not': "hadn't", 'what is': "what's", 'you shall': "you'll", 'I will': "I'll", 'might not have': "mightn't've", 'why is': "why's", 'they had': "they'd", 'you have': "you've", 'you all would': "y'all'd", 'he shall': "he'll", 'should not have': "shouldn't've", 'he is': "he's", 'so have': "so've", 'would not have': "wouldn't've", 'he will': "he'll", 'she would have': "she'd've", 'you will': "you'll", 'when is': "when's", 'so as': "so's", 'it shall have': "it'll've", 'who will': "who'll", 'there would': "there'd", 'she is': "she's", 'that had': "that'd", 'must have': "must've", 'would not': "wouldn't", 'madam': "ma'am", 'they will': "they'll", 'he would have': "he'd've", 'they will have': "they'll've", 'to have': "to've", 'could not have': "couldn't've", 'she has': "she's", 'must not have': "mustn't've", 'do not': "don't", 'that would': "that'd", 'need not': "needn't", 'cannot': "can't", 'you shall have': "you'll've", 'you would': "you'd", 'who shall': "who'll", 'you all would have': "y'all'd've", 'I shall': "I'll", 'we are': "we're", 'why has': "why's", 'she shall': "she'll", 'I have': "I've", 'what will have': "what'll've", 'it will have': "it'll've", 'have not': "haven't", 'what have': "what've", 'was not': "wasn't", 'could not': "couldn't", 'why have': "why've", 'I had': "I'd", 'I shall have': "I'll've", 'is not': "isn't", 'when has': "when's", 'that is': "that's", 'should have': "should've", 'what are': "what're", 'he has': "he's", 'might not': "mightn't", 'it will': "it'll", 'you had': "you'd", 'there is': "there's", 'what shall': "what'll", 'because': "'cause", 'he had': "he'd", 'I would': "I'd", 'I would have': "I'd've", 'he shall have': "he'll've", 'you will have': "you'll've", 'may not': "mayn't", 'where has': "where's", 'what shall have': "what'll've", 'where did': "where'd", 'we will have': "we'll've", 'we have': "we've", 'must not': "mustn't", 'he will have': "he'll've", 'who have': "who've", 'there has': "there's", 'cannot have': "can't've", 'shall not have': "shan't've", 'ought not': "oughtn't", 'how do you': "how'd'y", 'how does': "how's", 'should not': "shouldn't", 'are not': "aren't", 'you all have': "y'all've", 'does not': "doesn't", 'you are': "you're", 'how has': "how's", 'will have': "will've", 'there had': "there'd", 'let us': "let's", 'she had': "she'd", 'they would have': "they'd've", 'has not': "hasn't", 'who will have': "who'll've", 'did not': "didn't", 'who is': "who's", 'we had': "we'd", 'they have': "they've", 'he would': "he'd", 'so is': "so's", 'she will': "she'll", 'how is': "how's", 'how will': "how'll", 'you all': "y'all", 'they would': "they'd", 'they shall have': "they'll've", 'I am': "I'm", 'how did': "how'd", 'it had': "it'd", 'might have': "might've", 'were not': "weren't", 'will not have': "won't've", 'you all are': "y'all're", 'she shall have': "she'll've", 'she would': "she'd", 'who shall have': "who'll've", 'could have': "could've", 'it has': "it's", 'that has': "that's", 'who has': "who's", 'we would': "we'd", 'they shall': "they'll", 'that would have': "that'd've", 'you would have': "you'd've", 'they are': "they're", 'it is': "it's", 'had not have': "hadn't've", 'when have': "when've", 'need not have': "needn't've", 'it shall': "it'll", 'we will': "we'll", 'would have': "would've", 'what will': "what'll", 'I will have': "I'll've", 'there would have': "there'd've", 'will not': "won't"}
    for key in sorted(collapsable_contractions, key=len, reverse=True): # collapse phrases into contractions in order of length of phrase so "she would have" becomes "she'd've" instead of "she'd have"
        cleantext = cleantext.replace(key, collapsable_contractions[key])
    
    # 마침문자 처리
    punctuation = string.punctuation
    punctuation = punctuation.replace("#", "")
    punctuation = punctuation.replace("$", "")
    cleantext = cleantext.translate({ord(i):None for i in punctuation})
    
    # 단어를 명료한 단어로 처리
    words = cleantext.split(" ")
    words = [word.lower() for word in words]
    words = [word.strip() for word in words]
    words = [word for word in words if len(word) > 0]
    cleantext = " ".join(words)
    
    return cleantext

In [53]:
texts = set(data['text'])
cleantexts = [clean(text) for text in texts]
print(cleantexts)

['microsoft loves #mobile especially #ios and #android url', 'apple to replace dodgy iphone 6s batteries url', 'user thanks for following us the app is now live on ios at url amp android at url', 'buyers guide choosing the perfect macbook model from apples numerous new offerings url', 'again user new email upgrade horrible thread confusing not clear theres an attachment unless one scrolls thru thread #iphone #ios10', '#apple touch bar macbook pro orders begin shipping to european customers read more url $aapl', 'user great tweet ive been watching $psid closely its the hottest new #biotech stock right now $aapl $spy $wrap $srpt', 'finding you discounts on cardiff food deals download our iphone app url #tastebud', '#apple will fix troubled iphone 6s units read more url $aapl', 'macbook proair sleeves cover shop etsy url #macbookprosleeve #macbookprocover #macbookproleather #macbookair', 'user hey nate first time tweeting my son is a type 1 and were looking into getting him an apple watch