<a href="https://colab.research.google.com/github/tamgid/rlms.github.io/blob/main/Preprocessing_Sample_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import Section
!pip install langdetect #For language identification
from langdetect import detect
import csv
import codecs
import sys
import io
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Download the required resource for tokenization
import unicodedata  #This is for Accented character removal
from nltk.tokenize import TweetTokenizer #For tokenization
from nltk.corpus import stopwords
nltk.download('stopwords') #For Stop word removal
from nltk.stem import PorterStemmer #For Stemming

#Before using HashtagSegmenter - "pip install ekphrasis"
#from ekphrasis.classes.segmenter import Segmenter
#Global Initialization Section
#seg = Segmenter(corpus="twitter")



def languageIdentification(text):
  """Detect the language of *text*, print it, and return the language code.

  Args:
    text: Text handed to ``langdetect.detect``.

  Returns:
    The language code reported by ``detect`` (for example ``"en"``), or
    ``None`` when detection fails.
  """
  # Local import: keeps this function self-contained without touching the
  # file-level import section.
  from langdetect.lang_detect_exception import LangDetectException

  try:
    detected_language = detect(text)
  except LangDetectException:
    # langdetect raises when the text offers no usable features (e.g. it is
    # empty or holds only digits/punctuation). One such tweet should not
    # abort the processing of the whole data set.
    print("Detected Language:", "unknown")
    return None

  print("Detected Language:", detected_language)
  return detected_language


# Zero-width word boundaries inside a hashtag body:
#   * a non-uppercase word character followed by an uppercase letter
#     ("NoReligion" -> "No Religion"), or
#   * the end of an uppercase run that is followed by a capitalised word
#     ("USAToday" -> "USA Today").
# A run of capitals on its own ("USA") is left intact instead of being
# shattered into single letters.
_HASHTAG_WORD_BOUNDARY = re.compile(
    r"(?<=[^\WA-Z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])"
)


def hashtagSegmentation(text):
  """Split camel-cased hashtags into separate words.

  The text is tokenized with ``TweetTokenizer``. Tokens starting with ``#``
  lose the ``#`` and are split at camel-case boundaries; every other token
  is kept unchanged.

  Args:
    text: Raw tweet text.

  Returns:
    The tokens re-joined with single spaces. Because the text is
    re-assembled from tokens, the original spacing is not preserved.
  """
  tokenizer = TweetTokenizer()
  segmented_tokens = []

  for token in tokenizer.tokenize(text):
      if token.startswith("#"):
          # A bare "#" yields no words and is therefore dropped.
          spaced_body = _HASHTAG_WORD_BOUNDARY.sub(" ", token[1:])
          segmented_tokens.extend(spaced_body.split())
      else:
          segmented_tokens.append(token)

  return " ".join(segmented_tokens)


def specialCharacterRemoval(text):
  """Delete every character that is not an ASCII letter or whitespace.

  Punctuation, digits, and non-ASCII characters are removed outright (they
  are not replaced by a space), so neighbouring letters close up.

  Args:
    text: Input string.

  Returns:
    The string containing only ``a-z``, ``A-Z`` and whitespace characters.
  """
  unwanted_characters = re.compile(r'[^a-zA-Z\s]')
  return unwanted_characters.sub('', text)


# Matches "http://" or "https://" followed by a run of URL characters.
# The class is: "!", the ASCII range "#".."_" (which covers digits, upper-case
# letters and most URL punctuation such as / : ? = & % . -), lower-case
# letters, and "~". Including "#" and "~" lets fragments and tilde paths be
# consumed as part of the URL instead of being left behind.
_URL_PATTERN = re.compile(r'https?://[!#-_a-z~]+')


def urlNormalization(text, placeholder="URL"):
  """Replace every http/https URL in *text* with *placeholder*.

  Args:
    text: Input string.
    placeholder: Replacement for each URL. It is used as an ``re.sub``
      replacement template, so backslash escapes in it are interpreted.

  Returns:
    The text with each URL substituted by the placeholder.
  """
  return _URL_PATTERN.sub(placeholder, text)


def accentedCharacterRemoval(getTokenizedText):
  """Fold the text down to plain ASCII.

  The string is decomposed with Unicode NFKD normalization, then encoded to
  ASCII while ignoring anything that cannot be encoded. Accented letters
  therefore keep their base letter, and characters without an ASCII
  decomposition are dropped.

  Args:
    getTokenizedText: Input string.

  Returns:
    The ASCII-only version of the input.
  """
  decomposed = unicodedata.normalize('NFKD', getTokenizedText)
  ascii_bytes = decomposed.encode('ASCII', 'ignore')
  return ascii_bytes.decode('utf-8')


# Inclusive code-point ranges treated as emoji and removed from the text.
_EMOJI_RANGES = (
    ('\U0001F1E6', '\U0001F1FF'),  # Regional indicators (pairs form country flags)
    ('\U0001F300', '\U0001F5FF'),  # Symbols
    ('\U0001F600', '\U0001F64F'),  # Smileys & People
    ('\U0001F680', '\U0001F6FF'),  # Transport & Map
)


def emojiToTextConversion(text):
  """Remove emoji characters from *text*.

  Despite the name, no textual description is substituted: characters that
  fall in one of the emoji ranges are dropped and everything else is kept
  in order. Each regional-indicator symbol is removed on its own, so a flag
  (two indicators) disappears without consuming the characters after it.

  Args:
    text: Input string.

  Returns:
    The input with all characters in the emoji ranges removed.
  """
  return "".join(
      char for char in text
      if not any(low <= char <= high for low, high in _EMOJI_RANGES)
  )


def tokenization(getUrlNormalizedText):
  """Split the text into tokens using NLTK's ``word_tokenize``.

  Args:
    getUrlNormalizedText: Text to tokenize.

  Returns:
    The tokens produced by ``word_tokenize``.
  """
  return word_tokenize(getUrlNormalizedText)


# Load the custom stop-word list once, when this module/cell is executed.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale.
with open('StopWord.txt', 'r', encoding='utf-8') as stop_words_file:
    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and ignores leading/trailing whitespace, so the words may be
    # laid out on one line or across many.
    custom_stop_words = stop_words_file.read().split()

def stopWordRemoval(tokenized_text):
  """Drop tokens whose lower-cased form is in ``custom_stop_words``.

  Args:
    tokenized_text: Iterable of string tokens.

  Returns:
    A list of the remaining tokens, in their original order and casing.
  """
  # Build the lookup set once per call: membership tests become O(1) instead
  # of scanning the whole stop-word list for every token, and later changes
  # to custom_stop_words are still honoured.
  # NOTE(review): tokens are lower-cased before the lookup but the stop words
  # are not, so an entry containing upper-case letters can never match.
  stop_word_lookup = set(custom_stop_words)
  return [token for token in tokenized_text if token.lower() not in stop_word_lookup]


# Shared stemmer instance, created once and reused for every call.
stemmer = PorterStemmer()


def stemming(word_list):
    """Stem each word and drop duplicate stems.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        A list of unique stems, ordered by first occurrence.
    """
    stemmed_words = (stemmer.stem(word) for word in word_list)
    # dict.fromkeys keeps the first occurrence of each stem in insertion
    # order, giving an order-preserving de-duplication in linear time.
    return list(dict.fromkeys(stemmed_words))


# Lookup table: token -> normalized form, loaded once when this module/cell
# is executed. Each non-blank line of the file holds exactly two
# whitespace-separated fields: the original token and its replacement.
lexicon_normalized = {}
with open('LexicalNormalizationData.txt', 'r', encoding='utf-8') as f:
  for line in f:
    fields = line.split()
    if not fields:
      # Tolerate blank lines (e.g. a trailing empty line at end of file).
      continue
    # A non-blank line without exactly two fields is malformed and still
    # raises ValueError here rather than being silently ignored.
    original, normalized = fields
    lexicon_normalized[original] = normalized

# Map a single token through the lexicon_normalized lookup table
def lexicallyNormalization(token):
    """Return the normalized form of *token*, or *token* itself if unmapped.

    Args:
        token: Token to look up in ``lexicon_normalized``.

    Returns:
        The mapped value when the token is a key of ``lexicon_normalized``;
        otherwise the token unchanged.
    """
    return lexicon_normalized.get(token, token)


# Data Preprocessing Module
def preProcessingModule(text):
  """Run the full preprocessing pipeline on one tweet.

  Steps, in order: language identification (printed only), hashtag
  segmentation, URL normalization, accent removal, special-character
  removal, emoji removal, tokenization, stop-word removal, stemming, and
  lexical normalization of each stem.

  Args:
    text: Raw tweet text.

  Returns:
    A list of processed tokens.
  """
  languageIdentification(text)
  getHashtagSegmentedText = hashtagSegmentation(text)
  getUrlNormalizedText = urlNormalization(getHashtagSegmentedText)
  # Accent removal must run BEFORE special-character removal: the latter
  # deletes every non a-z/A-Z character, so accented letters would be thrown
  # away ("café" -> "caf") instead of being folded to their base letter
  # ("café" -> "cafe").
  getAccentedCharacterRemovedText = accentedCharacterRemoval(getUrlNormalizedText)
  getSpecialCharacterRemovedText = specialCharacterRemoval(getAccentedCharacterRemovedText)
  # NOTE(review): accentedCharacterRemoval encodes to ASCII with errors
  # ignored, so the text reaching this step is already ASCII-only and the
  # emoji step has nothing left to remove. It would need to run before the
  # accent step to have any effect.
  getEmojiToTextConvertedText = emojiToTextConversion(getSpecialCharacterRemovedText)
  getTokenizedText = tokenization(getEmojiToTextConvertedText)
  getStopWordRemovedText = stopWordRemoval(getTokenizedText)
  getStemmedText = stemming(getStopWordRemovedText)
  getLexicallyNormalizedText = [lexicallyNormalization(token) for token in getStemmedText]
  return getLexicallyNormalizedText


# Main Function Module
def main():
  """Preprocess every tweet in the SemEval-2018 irony-detection sample file.

  The file is read as tab-separated text with one header line; the tweet
  text is taken from the third column. Each raw tweet and its processed
  token list are printed.

  Returns:
    A list with one processed token list per tweet, in file order.
  """
  tweets = []
  csv.field_size_limit(500 * 1024 * 1024)
  # newline='' is what the csv module expects for files it reads; the
  # encoding is explicit so non-ASCII tweet text decodes the same way on
  # every platform.
  with open('SemEval2018-IronyDetectionSmallVersion.txt', 'r',
            encoding='utf-8', newline='') as f:
    next(f, None)  # skip headings; the default avoids StopIteration on an empty file
    # QUOTE_NONE: double quotes in tweets are ordinary text. With the
    # dialect's default quoting the reader strips them and, when a quote is
    # left unbalanced, keeps reading following rows into the same field.
    reader = csv.reader(f, dialect="excel-tab", quoting=csv.QUOTE_NONE)
    for line in reader:
      if len(line) < 3:
        # Blank or truncated row: there is no tweet-text column to process.
        continue
      print("\n")
      print(line[2])
      preProcessedTweetText = preProcessingModule(line[2])
      print(preProcessedTweetText)
      tweets.append(preProcessedTweetText)
  return tweets


# Entry point: run the preprocessing over the data file only when this code
# is executed directly, not when it is imported as a module.
if __name__ == '__main__':
  main()





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR
Detected Language: en
['sweet', 'unit', 'nation', 'video', 'time', 'christmas', 'imagine', 'religion', 'url']


@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)
Detected Language: en
['mrdahl', 'rumor', 'talk', 'erv', 'agent', 'angel', 'ask', 'ed', 'escobar', 'that', 'hardli', 'noth']


Hey there! Nice to see you Minnesota/ND Winter Weather 
Detected Language: en
['hey', 'nice', 'see', 'minnesota', 'and', 'winter', 'weather']


3 episodes left I'm dying over here
Detected Language: da
['episode', 'left', 'im', 'die']


I can't breathe! was chosen as the most notable quote of the year in an annual list released by a Yale University librarian 
Detected Language: en
['cant', 'breath', 'chosen', 'notabl', 'quot', 'year', 'annual', 'list', 'release', 'yale', 'university', 'librarian']


You're never too old for F

In [2]:
from google.colab import files
uploaded = files.upload()

Saving SemEval2018-IronyDetectionSmallVersion.txt to SemEval2018-IronyDetectionSmallVersion.txt


In [3]:
from google.colab import files
uploaded = files.upload()

Saving LexicalNormalizationData.txt to LexicalNormalizationData.txt


In [4]:
from google.colab import files
uploaded = files.upload()

Saving StopWord.txt to StopWord.txt
