### **Tweet Preprocessing**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by text_preprocessing below: tokenizer models,
# the stop-word list, the POS tagger and WordNet.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# Patterns are compiled once when the cell runs instead of on every call.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_MENTION_RE = re.compile(r"@\w+")
_HASHTAG_RE = re.compile(r"#\w+")
_REPEATED_LETTER_RE = re.compile(r"([a-zA-Z])\1{2,}")
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# First letter of the tag reported by nltk -> POS code handed to the lemmatizer.
# Any other tag falls back to noun ("n").
_TAG_TO_WORDNET_POS = {"N": "n", "V": "v", "R": "r", "J": "a"}


def text_preprocessing(text):
    """Clean a raw tweet and return its lemmatized tokens.

    Steps, in order:
      1. trim leading and trailing whitespace and lower-case the text;
      2. delete URLs, ``@mentions`` and ``#hashtags`` (the whole tag is
         removed, not only the symbol);
      3. collapse any letter repeated three or more times in a row to a
         single letter (``todaaaaay`` -> ``today``);
      4. drop every character that is not an ASCII letter or whitespace
         (apostrophes included, so ``I'm`` becomes ``im``);
      5. tokenize, remove English stop words, and lemmatize each remaining
         token with the part of speech inferred for it.

    Every token is tagged on its own, as a one-word sentence, so its tag does
    not depend on the neighbouring words.

    Args:
        text (str): Raw tweet text.

    Returns:
        list[str]: The cleaned, lemmatized tokens in their original order.
            Empty when nothing survives the cleaning.
    """
    # Normalise case and surrounding whitespace first so the patterns below
    # only ever see lower-case input.
    text = text.strip().lower()

    # Strip tweet-specific noise.
    text = _URL_RE.sub("", text)
    text = _MENTION_RE.sub("", text)
    text = _HASHTAG_RE.sub("", text)

    # Character normalization // todaaaaay -> today
    text = _REPEATED_LETTER_RE.sub(r'\1', text)

    # Remove digits, punctuation and any other non-letter character.
    text = _NON_LETTER_RE.sub("", text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]
    if not tokens:
        # Nothing left to tag; skip setting up the tagger altogether.
        return []

    # Tag all tokens in one batch. Each token is passed as its own one-word
    # sentence, which yields the same tags as tagging the words one call at a
    # time while setting the tagger up once instead of once per token.
    tagged = nltk.pos_tag_sents([[token] for token in tokens])

    lemma = WordNetLemmatizer()
    return [
        lemma.lemmatize(word, pos=_TAG_TO_WORDNET_POS.get(tag[0].upper(), "n"))
        for ((word, tag),) in tagged
    ]

# Example usage: a noisy tweet containing a mention, misspellings, runs of
# symbols and an emoticon. The printed value is the list of cleaned tokens.
sample_text = "@Dia jsut poseted someting (*)@(*$)(*)#@)!!! Im screaming ^U^"
processed_text = text_preprocessing(sample_text)
print(processed_text)


['jsut', 'poseted', 'someting', 'im', 'scream', 'u']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/433.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/433.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m368.6/433.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.11.0


In [None]:
# TweetTokenizer has to be imported in this cell: the only other import of it
# sits in the later "Packages" cell, so on a fresh kernel the tokenizer
# construction below would fail with a NameError.
import emoji
from nltk.tokenize import TweetTokenizer

nltk.download('stopwords')
nltk.download('wordnet')

# Module-level helpers shared by every preprocess_tweet call.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# Contractions recognised by preprocess_tweet. Keys are matched against whole
# lower-cased tokens; multi-word expansions are split into separate words.
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
}

_TWEET_URL_RE = re.compile(r'https?://\S+|www\.\S+')


def _clean_token(token):
    """Turn one lower-cased tweet token into zero or more clean words."""
    # Mentions are dropped outright.
    if token.startswith('@'):
        return []

    token = _TWEET_URL_RE.sub('', token)

    # Contractions must be expanded before the alphanumeric check below,
    # because the apostrophe would otherwise get the token discarded.
    if token in _CONTRACTIONS:
        return _CONTRACTIONS[token].split()

    # Emoji are replaced by their textual names. Space delimiters keep the
    # names free of the surrounding colons and separate adjacent emoji.
    described = emoji.demojize(token, delimiters=(" ", " "))
    if described != token:
        return described.lower().split()

    # Anything else survives only if it is purely alphanumeric; this removes
    # punctuation, hashtags, emptied-out URL tokens and symbol runs.
    return [token] if token.isalnum() else []


def preprocess_tweet(tweet):
    """Normalise a tweet into a single space-separated string of words.

    The tweet is tokenized with the module-level ``tokenizer`` and each token
    is lower-cased and cleaned: mentions and URLs are removed, the
    contractions listed in ``_CONTRACTIONS`` are expanded, emoji are replaced
    by their names, and every other token that is not purely alphanumeric is
    discarded. The remaining words are filtered against ``stop_words`` and
    lemmatized with the module-level ``lemmatizer``.

    Mention removal, contraction expansion and emoji conversion all run
    before the alphanumeric filter, so none of them is skipped.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The cleaned words joined by single spaces; an empty string when
            no word survives.
    """
    words = []
    for token in tokenizer.tokenize(tweet):
        words.extend(_clean_token(token.lower()))

    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Try the tweet pipeline on a noisy sample (mention, misspellings, symbol runs,
# an emoticon and an emoji); the returned string is shown as the cell output.
preprocess_tweet("@Dia jsut poseted someting (*)@(*$)(*)#@)!!! Im screaming ^U^😃")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'jsut poseted someting im screaming u'

### **Packages**

In [None]:
import re
import nltk
import numpy as np
import pandas as pd

import warnings
# Suppresses every warning for the rest of the session; remove this line while
# debugging so warnings become visible again.
warnings.filterwarnings("ignore")

import emoji

import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SimpleRNN
from gensim.models import Word2Vec



import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention

from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [None]:
# Fetch the NLTK resources the notebook relies on: the stop-word list, WordNet,
# the punkt tokenizer models and the POS tagger.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')