In [19]:
import pandas as pd

musk_df = pd.read_csv('musk.csv', index_col=0)
musk_df.sample(5)

Unnamed: 0,type,author,text,created
58284,Comment,nikatnight,Of course. This is an interesting thing that h...,2023-04-03 16:11:55+00:00
55666,Comment,Inevitable_Egg5341,I'd love to sleep in a 69 position with her la...,2023-03-31 11:56:14+00:00
69752,Comment,PlastikHateAccount,Karl Lauterbach hat eine Beziehung zu Deadline...,2023-04-15 17:23:36+00:00
79478,Comment,SporkTechRules,"""Paging Mr. Musk. Mr. Musk, please pick up the...",2023-04-26 17:59:08+00:00
14759,Comment,FeedMeACat,What do you mean? Musk playing hardball when t...,2023-02-15 18:39:51+00:00


In [20]:
musk_df['type'].value_counts()

Comment                      61620
2023-04-30 15:47:51+00:00        1
Name: type, dtype: int64

In [21]:
musk_df.drop('type', axis=1, inplace=True)

In [22]:
TEXTS_COL_NAME = 'text'

In [23]:
def remove_duplicates_and_na(df: pd.DataFrame) -> pd.DataFrame:
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    return df

musk_df_clean = remove_duplicates_and_na(musk_df)
musk_df.shape[0], musk_df_clean.shape[0]

(61619, 61619)

In [24]:
def to_lower(text: str) -> str:
    return text.lower()

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(to_lower)
musk_df_clean[TEXTS_COL_NAME]

0        i had been on twitter prior to the musk takeov...
2        that article does not say what you imply at al...
3                       the og musk duck lives on my wall.
4        how dare he speak that way to the great and po...
5        can’t wait to finally have an excuse not to sh...
                               ...                        
85215    like brain dead piranhas lmao, anything barely...
85221    what am i lying about? what's my agenda? pleas...
85223    hard disagree. i don't think he's a parody of ...
85224    yeah. i think too many things are lining up ri...
85225    good choice by musk. it is impossible to regul...
Name: text, Length: 61619, dtype: object

In [25]:
from unidecode import unidecode

def replace_to_unicode(text:str) -> str:
    return unidecode(text)

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(replace_to_unicode)

In [26]:
import re

def remove_urls(text: str) -> str:
   return re.sub(r"(www.[^\s]+)|(https?:\/\/[^\s]+)", "", text)

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(remove_urls)

In [27]:
def normalize_text(text: str) -> str:
    if not isinstance(text, str):
        print(type(text))
    matches = re.findall(r"[a-zA-Z\s']+", text)
    return ''.join(matches)

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(normalize_text)
musk_df_clean.sample(10)

Unnamed: 0,author,text,created
30294,AbsAndAssAppreciator,i guess his childhood would be different so he...,2023-03-03 11:12:26+00:00
4032,Afraid-Contract-385,i know some people dislike it but i often use ...,2023-02-03 18:00:07+00:00
22884,Neil_is_me,elon musk can do basic math maybe that's why h...,2023-02-24 16:38:54+00:00
68514,I_smell_NORMIES,elon musk he's african so his name doesn't change,2023-04-13 15:35:24+00:00
10231,bigbadhank7,i bought in april last year i love the car and...,2023-02-09 17:00:17+00:00
29596,Matsapha,thanks for the link anyone who helps dissemin...,2023-03-03 18:01:36+00:00
45092,ergzay,elon isn't an emerald mine heir that's a myth,2023-03-21 18:31:17+00:00
1256,WeCanDoThisCNJ,in alone there were climaterelated disasters...,2023-01-31 17:15:47+00:00
41034,Eastern_Ad_4441,government spending is not the problem that is...,2023-03-14 12:07:09+00:00
4070,SmolKittenSlut,please rest it on my face i'll take in all you...,2023-02-03 17:49:27+00:00


In [28]:
import contractions

def replace_contractions(text: str) -> str:
    return contractions.fix(text)

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(replace_contractions)
musk_df_clean.sample(10)

Unnamed: 0,author,text,created
17056,TimeIsTheMindOfSpace,elon musk is a megachurch pastor for atheistsd...,2023-02-17 16:29:34+00:00
2295,zlefin_actual,because he is a bad person doing a lot of bad ...,2023-02-01 16:27:46+00:00
25543,honeydew808,comma is in the wrong place in musk's reply,2023-02-26 09:37:24+00:00
60073,Chris_Hansen_AMA,yeah like pressure from china who musk despera...,2023-04-05 17:13:38+00:00
70879,Redd575,gt and i think twitter has never made a profit...,2023-04-16 15:56:56+00:00
41305,Rick_101,ok i get some of your points you could agree t...,2023-03-15 17:46:43+00:00
72360,orgngrndr01,i think though that the combination of new bat...,2023-04-17 14:26:33+00:00
51512,A-U-R-A,maybe they are considering advertising on twit...,2023-03-27 14:50:45+00:00
32498,GrandmaPoses,elon musk's childhood was a difficult one as h...,2023-03-06 18:22:33+00:00
16951,MrZFisher,musk said he was a right winger suspiciously b...,2023-02-17 17:07:08+00:00


In [29]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory('language_detector')
def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load('en_core_web_md', disable=['ner', 'parser'])
nlp.add_pipe('sentencizer')
nlp.add_pipe('language_detector')

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7f24364e74f0>

In [30]:
musk_df_clean.shape[0]

61619

In [31]:
def remove_rows_with_non_english_texts(df: pd.DataFrame) -> pd.DataFrame:
    rows_with_eng_texts_indices = []
    for i, text in df.iterrows():
        doc = nlp(text[TEXTS_COL_NAME])
        if doc._.language['language'] == 'en':
            rows_with_eng_texts_indices.append(i)
    return df.loc[rows_with_eng_texts_indices]

musk_df_clean = remove_rows_with_non_english_texts(musk_df_clean)

In [32]:
musk_df_clean.shape[0]

55868

In [33]:
def remove_stop_words(text: str) -> str:
    text_without_stop_words = []
    for word in text.split(' '):
        if word not in nlp.Defaults.stop_words:
            text_without_stop_words.append(word)
    return " ".join(text_without_stop_words)

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(remove_stop_words)

In [34]:
def lemmatize(text: str) -> str:
    text_lemmatized = []
    for word in nlp(text):
        text_lemmatized.append(word.lemma_)
    return " ".join(text_lemmatized)

musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(lemmatize)

In [35]:
musk_df_clean[TEXTS_COL_NAME] = musk_df_clean[TEXTS_COL_NAME].apply(str.split)

In [36]:
musk_df_clean.sample(10)

Unnamed: 0,author,text,created
50073,AresInRepose,"[hello, oh, lot, lovely, water, energy, you, c...",2023-03-26 18:02:04+00:00
85151,Superloopertive,"[hard, separate, grime, musk, 's, grossness, p...",2023-05-01 08:29:01+00:00
2976,Call_of_Tculhu,"[elon, musk, tom, brady, role, model, legion, ...",2023-02-02 18:39:51+00:00
80503,PM_ME_YO_TREE_FIDDY,"[damn, guy, lose, time, try, excuse, teslamusk...",2023-04-27 17:46:06+00:00
69941,N_Who,"[fucker, call, sixmonth, pause, ai, developmen...",2023-04-15 16:11:22+00:00
82404,RandolphE6,"[meet, odorous, guy, college, say, shower, bui...",2023-04-29 18:05:34+00:00
6302,plopop0,"[elon, musk, tesla, 's, cofounder, ceo, elon, ...",2023-02-05 15:49:35+00:00
81899,RiotDog1312,"[actually, listen, man, speak, fortune, entire...",2023-04-28 14:46:17+00:00
18125,MissThreepwood,"[ai, naughty, course, ai, uprise, musk, talk]",2023-02-18 16:02:41+00:00
32483,MirrorSauce,"[twitter, challenger, explosion, matter, elon,...",2023-03-06 18:27:43+00:00


In [37]:
musk_df_clean.to_csv('musk_clean.csv')