In [1]:
import re 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd

# Procesamiento

In [2]:
# Sample tweet used by every cleaning demo below. Adjacent string literals are
# concatenated by Python, so the value is one single line of text; the double
# space before "4" is intentional (it exercises the whitespace-collapse demo).
text = (
    'This tweet includes  4 mentions to users in the format @username, '
    '#hashtags in the format #hashtag, '
    'emojis such as 👋, 😌, 😎, 🎉, 😊, '
    'URLs like https://example.com/barbecue-recipe, '
    'and common punctuation characters like exclamation marks, '
    'question marks, and colons.'
)

In [3]:
# Echo the sample tweet so its exact contents (double space, emojis, URL) are visible.
text

'This tweet includes  4 mentions to users in the format @username, #hashtags in the format #hashtag, emojis such as 👋, 😌, 😎, 🎉, 😊, URLs like https://example.com/barbecue-recipe, and common punctuation characters like exclamation marks, question marks, and colons.'

In [6]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads below.
# The call is the cell's last expression, so its return value (True in the
# recorded run) is displayed after the download log.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jcama\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

#### Stop Word Removal

In [10]:
# Build the stop-word lookup as a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
# Tokens come from a plain whitespace split, so punctuation stays attached
# (e.g. '@username,' and 'colons.'); the comparison is case-insensitive but the
# surviving tokens keep their original casing.
[
    token
    for token in text.split()
    if token.lower() not in stop_words
]

['tweet',
 'includes',
 '4',
 'mentions',
 'users',
 'format',
 '@username,',
 '#hashtags',
 'format',
 '#hashtag,',
 'emojis',
 '👋,',
 '😌,',
 '😎,',
 '🎉,',
 '😊,',
 'URLs',
 'like',
 'https://example.com/barbecue-recipe,',
 'common',
 'punctuation',
 'characters',
 'like',
 'exclamation',
 'marks,',
 'question',
 'marks,',
 'colons.']

#### Special character (punctuation) removal

In [11]:
# Delete every character that is neither a word character (\w) nor whitespace (\s).
# As the output shows, this also strips the emojis and collapses the URL into a
# single token, because ':', '/', '.' and '-' are removed too.
re.sub(pattern=r'[^\w\s]', repl='', string=text)

'This tweet includes  4 mentions to users in the format username hashtags in the format hashtag emojis such as      URLs like httpsexamplecombarbecuerecipe and common punctuation characters like exclamation marks question marks and colons'

#### Lowercase

In [12]:
# Print each whitespace-separated token in lower case, one per line.
for token in map(str.lower, text.split()):
    print(token)


this
tweet
includes
4
mentions
to
users
in
the
format
@username,
#hashtags
in
the
format
#hashtag,
emojis
such
as
👋,
😌,
😎,
🎉,
😊,
urls
like
https://example.com/barbecue-recipe,
and
common
punctuation
characters
like
exclamation
marks,
question
marks,
and
colons.


#### Emoji Removal

In [13]:
# Compiled character class matching runs of emoji code points.
#
# The first four ranges are the original ones. The remaining entries close gaps
# that let common emoji through: hearts/snowflakes and similar symbols live in
# the BMP (U+2600-U+27BF), they are usually followed by the invisible variation
# selector U+FE0F, and newer pictographs live in U+1F900-U+1F9FF and
# U+1FA70-U+1FAFF. Skin-tone modifiers (U+1F3FB-U+1F3FF) are already inside the
# "symbols & pictographs" range.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


In [14]:
# Strip the matched emoji runs; only the emoji code points are removed, so the
# commas and spaces that separated them remain in the result.
emoji_pattern.sub('', text)

'This tweet includes  4 mentions to users in the format @username, #hashtags in the format #hashtag, emojis such as , , , , , URLs like https://example.com/barbecue-recipe, and common punctuation characters like exclamation marks, question marks, and colons.'

#### Space removal

In [15]:
# Squeeze every run of whitespace into a single space, then trim both ends.
re.sub(pattern=r'\s+', repl=' ', string=text).strip()

'This tweet includes 4 mentions to users in the format @username, #hashtags in the format #hashtag, emojis such as 👋, 😌, 😎, 🎉, 😊, URLs like https://example.com/barbecue-recipe, and common punctuation characters like exclamation marks, question marks, and colons.'

#### Number removal

In [16]:
# Remove every run of digits. The spaces around a removed number are kept,
# which is why the output shows three consecutive spaces where "4" used to be.
re.sub(pattern=r'\d+', repl='', string=text)

'This tweet includes   mentions to users in the format @username, #hashtags in the format #hashtag, emojis such as 👋, 😌, 😎, 🎉, 😊, URLs like https://example.com/barbecue-recipe, and common punctuation characters like exclamation marks, question marks, and colons.'

#### User Removal

In [17]:
# Remove @mentions. \S+ runs to the next whitespace, so punctuation glued to the
# handle goes too ('@username,' disappears together with its comma).
re.sub(pattern=r'@\S+', repl='', string=text)

'This tweet includes  4 mentions to users in the format  #hashtags in the format #hashtag, emojis such as 👋, 😌, 😎, 🎉, 😊, URLs like https://example.com/barbecue-recipe, and common punctuation characters like exclamation marks, question marks, and colons.'

#### Hashtag Removal

In [18]:
# Remove #hashtags. As with mentions, \S+ also consumes trailing punctuation
# attached to the tag ('#hashtag,' is removed including the comma).
re.sub(pattern=r'#\S+', repl='', string=text)

'This tweet includes  4 mentions to users in the format @username,  in the format  emojis such as 👋, 😌, 😎, 🎉, 😊, URLs like https://example.com/barbecue-recipe, and common punctuation characters like exclamation marks, question marks, and colons.'

#### Link Removal

In [19]:
# Remove anything that starts with "http" up to the next whitespace; this covers
# both http:// and https:// URLs and takes the trailing comma with it.
re.sub(pattern=r'http\S+', repl='', string=text)

'This tweet includes  4 mentions to users in the format @username, #hashtags in the format #hashtag, emojis such as 👋, 😌, 😎, 🎉, 😊, URLs like  and common punctuation characters like exclamation marks, question marks, and colons.'

# Unit testing

In [20]:
def convert_to_lowercase(text):
    """Return ``text`` lower-cased.

    Args:
        text: The value to convert.

    Returns:
        The lower-cased string when ``text`` is a ``str``; ``None`` for any
        other type (non-string input is not converted and not rejected).
    """
    if not isinstance(text, str):
        return None
    return text.lower()

def test_convert_to_lowercase(text, expected):
    """Assert that ``convert_to_lowercase(text)`` equals ``expected``.

    Raises:
        AssertionError: If the converted value differs from ``expected``.
    """
    actual = convert_to_lowercase(text)
    assert actual == expected


In [21]:
# NOTE: this rebinds the global `text` that the regex demo cells above read;
# re-running those cells after this one will operate on this string instead.
text = 'HOLA ME GUSTAN LOS CHOCOLATES'
# Run the check through the helper defined for it; it raises AssertionError on mismatch.
test_convert_to_lowercase(text, 'hola me gustan los chocolates')

In [22]:
from text_preprocessor import TextPreprocessor

In [25]:
# Load the tweets dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
df = pd.read_csv('C:/Clases DMC/CLASES DMC/tweets.csv')
# Fill missing values before casting: astype(str) alone turns NaN into the
# literal string 'nan', which the text preprocessor would then treat as a word.
df['content'] = df['content'].fillna('').astype(str)

In [26]:
# Cleaning steps to enable. The keyword spellings (including `remove_hastags`)
# are passed exactly as before -- presumably they match the TextPreprocessor
# signature in text_preprocessor.py; verify there before renaming anything.
preprocessor_options = {
    'remove_links': True,
    'remove_hastags': True,
    'remove_characters': True,
    'convert_to_lowercase': True,
    'remove_emojis': True,
    'remove_numbers': True,
    'remove_stopwords_flag': True,
}
preprocessor = TextPreprocessor(**preprocessor_options)

# Clean every tweet and store the result next to the raw text for comparison.
df['content_2'] = df['content'].apply(preprocessor.preprocess_text)

In [27]:
# Show the raw and cleaned text side by side, keyed by author.
df.loc[:, ['author', 'content', 'content_2']]

Unnamed: 0,author,content,content_2
0,katyperry,Is history repeating itself...?#DONTNORMALIZEH...,history repeating
1,katyperry,@barackobama Thank you for your incredible gra...,barackobama thank incredible grace leadership ...
2,katyperry,Life goals. https://t.co/XIn1qKMKQl,life goals
3,katyperry,Me right now 🙏🏻 https://t.co/gW55C1wrwd,right
4,katyperry,SISTERS ARE DOIN' IT FOR THEMSELVES! 🙌🏻💪🏻❤️ ht...,sisters doin
...,...,...,...
52537,ddlovato,Life couldn't be better right now. 😊,life couldnt better right
52538,ddlovato,First Monday back in action. I'd say 21.6 mile...,first monday back action id say miles success
52539,ddlovato,"Crime shows, buddy, snuggles = the perfect Sun...",crime shows buddy snuggles perfect sunday night
52540,ddlovato,❄️ http://t.co/sHCFdPpGPa,


In [29]:
# Explanatory note (in Spanish) describing the four code-point ranges used by
# emoji_pattern. It is a bare string expression, so the notebook displays it as
# the cell output; because the literal is not a raw string, the \U escapes are
# interpreted and the output shows the actual characters instead of the codes.
# NOTE(review): U+1F1E6-U+1F1FF are the Unicode regional indicator symbols,
# which pair up to form flags on any platform, not only iOS -- and the
# U+1F300-U+1F5FF block does not appear to hold mathematical symbols. Confirm
# against the Unicode charts and correct the text (ideally as a markdown cell).
"""\U0001F600-\U0001F64F: Este rango representa emoticonos. Incluye emojis como caras sonrientes, guiños, gestos faciales, entre otros.
\U0001F300-\U0001F5FF: Aquí se incluyen símbolos y pictogramas que van desde símbolos matemáticos hasta iconos meteorológicos y demás elementos gráficos.
\U0001F680-\U0001F6FF: Este rango abarca símbolos de transporte y mapas, como aviones, cohetes, vehículos, señales de tráfico, entre otros.
\U0001F1E0-\U0001F1FF: Estos son los códigos correspondientes a las banderas, específicamente las banderas que se utilizan en dispositivos iOS."""

'😀-🙏: Este rango representa emoticonos. Incluye emojis como caras sonrientes, guiños, gestos faciales, entre otros.\n🌀-🗿: Aquí se incluyen símbolos y pictogramas que van desde símbolos matemáticos hasta iconos meteorológicos y demás elementos gráficos.\n🚀-\U0001f6ff: Este rango abarca símbolos de transporte y mapas, como aviones, cohetes, vehículos, señales de tráfico, entre otros.\n\U0001f1e0-🇿: Estos son los códigos correspondientes a las banderas, específicamente las banderas que se utilizan en dispositivos iOS.'