reference: https://github.com/VinAIResearch/BERTweet/blob/master/TweetNormalizer.py

In [1]:
import os
import re
import pandas as pd
import numpy as np
# from PIL import Image
import torch
# from transformers import BertTokenizer
from emoji import demojize
import argparse
from sklearn.model_selection import train_test_split


## Text Processing

In [None]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer


# Shared NLTK TweetTokenizer instance; normalize_tweet calls its .tokenize()
# method on every tweet, so it is built once at module level.
tokenizer = TweetTokenizer()

# Text wrapped in colons (":name:") may be an emoji shortcode. Convert it back
# to the emoji first so that the tokenizer does not split it apart.
def emoji_tokenize(text):
    """Replace ``:shortcode:`` spans in *text* with their emoji.

    Every non-greedy ``:...:`` span found in *text* is handed to
    ``emoji.emojize`` and all occurrences of that span are replaced by the
    result. Spans that are not real shortcodes (for example the text between
    the colons of two URLs) are presumably returned unchanged by ``emojize``
    -- NOTE(review): confirm against the installed emoji version.

    Args:
        text: Raw tweet text.

    Returns:
        The text with colon-delimited spans passed through ``emojize``.
    """
    # Extract the text enclosed by a pair of colons.
    pattern = r':(.*?):'
    matches = re.findall(pattern, text)
    if not matches:
        # Nothing colon-delimited: no conversion (and no import) needed.
        return text

    # The file only imports ``demojize``; without this import the call below
    # raised NameError as soon as a tweet contained two colons.
    from emoji import emojize

    # dict.fromkeys de-duplicates while keeping first-seen order; replacing
    # the same span twice would be a no-op anyway.
    for match in dict.fromkeys(matches):
        shortcode = f":{match}:"
        text = text.replace(shortcode, emojize(shortcode))
    return text


def normalize_punctuation(text):
    """Collapse runs of repeated punctuation into a short marker.

    The substitutions are applied in a fixed order over the whole string:

    1. two or more ``!``                     -> ``!``
    2. two or more ``?``                     -> ``?``
    3. two or more ``.``                     -> ``...``
    4. two or more characters from ``!``/``?`` -> ``!?``

    Step 4 runs on the output of steps 1-2, so a mixed run such as ``??!!``
    first shrinks to ``?!`` and then becomes ``!?``.

    Examples: ``'!!!' -> '!'``, ``'???' -> '?'``, ``'??!!' -> '!?'``.
    """
    # (pattern, replacement) pairs; the order matters, see the docstring.
    ordered_rules = (
        (r"[!]{2,}", '!'),        # multiple !
        (r"[?]{2,}", '?'),        # multiple ?
        (r"[...]{2,}", '...'),    # multiple . (character class of dots)
        (r"[!?]{2,}", '!?'),      # mix ?!?!?!
    )

    normalized = text
    for regex, marker in ordered_rules:
        normalized = re.sub(regex, marker, normalized)
    return normalized


def normalize_token(token):
    """Map a single tokenizer token to its normalized form.

    Rules, checked in this order:

    * tokens starting with ``@``                      -> ``@USER``
    * tokens starting with ``http``/``www`` (any case) -> ``HTTPURL``
    * the typographic apostrophe ``’``                -> ``'``
    * the ellipsis character ``…``                    -> ``...``
    * any other single-character token -> ``demojize(token)`` (translates
      emoji into text strings)
    * everything else is returned unchanged

    Args:
        token: One token as produced by the tweet tokenizer.

    Returns:
        The normalized token string.
    """
    # Lowercase (and strip) only for the case-insensitive URL check; the
    # token itself is returned with its original casing.
    lowercased_token = token.strip().lower()

    # convert user mentions into special tokens @USER
    if token.startswith("@"):
        return "@USER"  # issue: some people may use @ for location

    # convert web/url links to HTTPURL
    if lowercased_token.startswith(("http", "www")):
        return "HTTPURL"

    # These two are single characters, so they must be handled BEFORE the
    # generic single-character branch below; previously they sat after it
    # and were unreachable, leaving "…" un-normalized.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    # translate emotion icons into text strings
    if len(token) == 1:
        return demojize(token)

    return token


def normalize_tweet(tweet):
    """Normalize one raw tweet into a space-separated token string.

    Pipeline: replace the typographic apostrophe, convert colon-delimited
    emoji shortcodes, collapse repeated punctuation, tokenize with the
    module-level ``tokenizer``, normalize each token, then apply a fixed list
    of textual rewrites for contractions and "a.m."/"p.m.". Finally all
    whitespace runs are squeezed to single spaces.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet as a single string.
    """
    # Ordered (old, new) rewrites applied to the joined token string. The
    # order is significant: e.g. "n't " is split off first, after which
    # "ca n't" / "ai n't" are glued back together.
    rewrite_rules = (
        # negations
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        # clitics are split from their host word
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        # re-join time abbreviations that tokenization broke apart
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )

    cleaned = tweet.replace("’", "'")
    cleaned = emoji_tokenize(cleaned)          # emoji normalization
    cleaned = normalize_punctuation(cleaned)   # punctuation normalization

    # tweet tokenization followed by per-token normalization
    normalized_tokens = [normalize_token(tok) for tok in tokenizer.tokenize(cleaned)]
    norm_tweet = " ".join(normalized_tokens)

    for old, new in rewrite_rules:
        norm_tweet = norm_tweet.replace(old, new)

    # Squeeze any doubled spaces introduced by the rewrites above.
    return " ".join(norm_tweet.split())

In [6]:
# Demo: a tweet containing one long URL and a trailing @mention; the URL is
# expected to collapse to HTTPURL and the mention to @USER.
print(
    normalize_tweet(
        "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
    )
)

SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER


In [None]:
# 5127.txt
# Demo: a retweet with a hashtag, a quoted sentence, repeated "!!" and two
# URLs, exercising punctuation collapsing and the 's clitic split.
print(
  normalize_tweet(
    "RT @bookmyshow: #Disney's lies - \"Every day is a good hair day\". Yeah, right!!: http://t.co/351AQVV7gA http://t.co/fFH59wGwbs"
  )
)

# TODO: may consider removing retweet; concern: the sentiment after retweet might be opposite to the original

RT @USER : #Disney 's lies - " Every day is a good hair day " . Yeah , right ! : HTTPURL HTTPURL


In [8]:
# 5125.txt
# Demo: a tweet that consists only of two URLs, the second one truncated
# with an ellipsis character.
print(
  normalize_tweet(
    "http://t.co/Y4ZrSmq2KB http://t.co/L…"
  )
)

HTTPURL HTTPURL


In [9]:
import chardet

# Probe the encoding of one sample data file: read it as raw bytes and let
# chardet guess. Only this single file is inspected here.
with open('data/2.txt', 'rb') as f:
    data = f.read()
    result = chardet.detect(data)
    print(result)  # {'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [11]:
from tqdm import tqdm

def _numeric_name_key(filename):
    """Sort key: files with an integer stem first, in numeric order; others after, by name."""
    stem = filename.split(".")[0]
    try:
        return (0, int(stem), filename)
    except ValueError:
        # Non-numeric stem (e.g. "notes.txt"): sort after the numbered files
        # instead of crashing the whole run.
        return (1, 0, filename)


# process all .txt files in the /data directory
def processFiles(input_dir="data", output_dir="data_processed", encoding="ISO-8859-1"):
    """Normalize every ``.txt`` file in *input_dir* and write it to *output_dir*.

    Each input file is read line by line; every line is stripped and passed
    through ``normalize_tweet``. The processed lines are joined with newlines
    (no trailing newline) and written as UTF-8 to a file of the same name in
    *output_dir*.

    Args:
        input_dir: Directory containing the source ``.txt`` files.
        output_dir: Directory for the processed files; created if missing.
        encoding: Encoding used to read the input files. The default keeps
            the previously hard-coded ``ISO-8859-1``.
    """
    # create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Filter to .txt files BEFORE sorting: the old code applied int() to the
    # stem of every directory entry and raised ValueError on any file whose
    # name did not start with a number (hidden files, CSVs, ...).
    txt_files = sorted(
        (name for name in os.listdir(input_dir) if name.endswith(".txt")),
        key=_numeric_name_key,
    )

    for filename in tqdm(txt_files):
        file_path = os.path.join(input_dir, filename)
        with open(file_path, "r", encoding=encoding) as f:
            # process each line in the file
            processed_lines = [normalize_tweet(line.strip()) for line in f]

        # write the processed lines to a new file in the output directory
        output_file_path = os.path.join(output_dir, filename)
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write("\n".join(processed_lines))

# Run the batch job with the default directories (data -> data_processed).
processFiles()

100%|██████████| 9738/9738 [00:02<00:00, 4425.68it/s]
