In [2]:
import nltk
import string
import re
import requests
import pandas as pd
import json
# Show the full contents of every cell instead of truncating long text
# (None removes the column-width limit when frames are displayed).
pd.set_option('display.max_colwidth', None)

In [None]:
# Load the training spreadsheet; the later cells read the "Text" and
# "Media URLs" columns from `df` and write their results back into it.
df = pd.read_excel(io='train_data.xlsx')
# Preview the first 20 rows as the cell output.
df.head(n=20)

# Text preprocessing functions

In [None]:
# 1- Clear out HTML characters 
import html
def clear_HTMLcharacters(text):
    """Return ``text`` with HTML character references decoded.

    For example ``&amp;`` becomes ``&`` and ``&lt;`` becomes ``<``.
    """
    return html.unescape(text)


# 2- Strip non-ASCII characters (encode to ASCII ignoring errors, then decode back to str)
def encode_utf8(text):
    """Return ``text`` with every non-ASCII character removed.

    The string is encoded to ASCII with ``errors='ignore'`` (which silently
    drops anything outside ASCII, e.g. emoji or accented letters) and the
    resulting bytes are decoded back into a ``str``.
    """
    return str(text.encode('ascii', 'ignore'), encoding='UTF-8')


# 3- Extract Hashtags
def extract_hashtags(text):
    """Return the hashtags found in ``text`` without their leading ``#``.

    Only whitespace-separated tokens that *start* with ``#`` count; a ``#``
    in the middle of a token is ignored. A lone ``#`` yields an empty string.
    """
    return [token[1:] for token in text.split() if token.startswith('#')]


# 4- Removing URLs, Hashtags and Styles
def remove_URL_Hashtags_Style(text):
    """Strip URLs, ``#`` signs and a leading retweet marker from ``text``.

    Steps, in order:
      1. remove every ``http://`` / ``https://`` URL up to the next whitespace;
      2. remove ``#`` characters (the hashtag word itself is kept);
      3. remove a leading ``RT`` followed by whitespace.

    Returns the cleaned string; surrounding whitespace is left untouched.
    """
    # The URL tail is matched with \S* only. The previous pattern had a bare
    # `.` after the scheme, which could match a space: a truncated link such
    # as "https:// word" then swallowed the following word, while a URL with a
    # single character after "//" was not removed at all.
    text = re.sub(r'https?://\S*', "", text)
    text = re.sub(r'#', '', text)
    # Anchored at the start of the string: only a leading "RT " is dropped.
    text = re.sub(r'^RT[\s]+', '', text)
    return text


# 5- Convert Numbers into words
import inflect
# Shared inflect engine; convert_number() calls its number_to_words() to
# spell out digit-only tokens.
p = inflect.engine()
def convert_number(text):
    """Spell out standalone numbers and delete any digits that remain.

    ``text`` is split on whitespace. Tokens made up only of digits are
    replaced by the words produced by the module-level inflect engine ``p``;
    all other tokens are kept. The tokens are re-joined with single spaces and
    every remaining run of digits (e.g. inside ``b4``) is removed.
    """
    spelled_out = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return re.sub(r'\d+', '', ' '.join(spelled_out))


# 6- Text Lowercase
def text_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered


# 7- Replace Contraction 
Apos_dict={"'s":" is","n't":" not","'m":" am","'ll":" will","'d":" would","'ve":" have","'re":" are"}
# Negations whose stem changes when contracted. They must be expanded before
# the generic "n't" rule in Apos_dict, which would otherwise produce
# "ca not" / "wo not" / "sha not".
_IRREGULAR_CONTRACTIONS = {"won't": "will not", "can't": "can not", "shan't": "shall not"}
def replace_contraction(text):
    """Expand English contractions in ``text``.

    Irregular negations (``won't``, ``can't``, ``shan't``) are replaced first,
    then every suffix listed in ``Apos_dict`` is expanded in dictionary order.
    Matching is a plain, case-sensitive substring replacement, so the input is
    expected to be lower-cased already, and possessive ``'s`` is expanded to
    `` is`` as well.

    Returns the expanded string.
    """
    for contraction, expansion in _IRREGULAR_CONTRACTIONS.items():
        text = text.replace(contraction, expansion)
    # str.replace is a no-op when the key is absent, so no membership test is
    # needed before replacing.
    for key, value in Apos_dict.items():
        text = text.replace(key, value)
    return text


# 8- Split attached words
def split_attached_words(text):
    """Insert spaces in front of capitalised words glued to their neighbours.

    The text is split on every run that starts with an upper-case letter
    followed by at least one lower-case letter (and then anything up to the
    next upper-case letter). The non-empty pieces are re-joined with a single
    space, e.g. ``"HelloWorld"`` -> ``"Hello World"``.
    """
    fragments = re.split("([A-Z][a-z]+[^A-Z]*)", text)
    return " ".join(filter(None, fragments))


# 9- Collapse letters repeated more than twice in a row
import itertools
def remove_more_than_twice(text):
    """Shorten runs of a repeated character to at most two occurrences.

    ``"sooooo coool"`` becomes ``"soo cool"``. Runs of digits are left
    untouched: truncating them would silently change numbers (``"10000"``
    used to become ``"100"``) before they are spelled out later in the
    pipeline.

    Returns the shortened string.
    """
    pieces = []
    for char, run in itertools.groupby(text):
        run_text = ''.join(run)
        # Keep numeric runs intact; cap every other run at two characters.
        pieces.append(run_text if char.isdigit() else run_text[:2])
    return ''.join(pieces)

# 10- Spell Checking
from autocorrect import Speller
# One English speller instance shared by every spell_checking() call.
spell = Speller(lang='en')
def spell_checking(text):
    """Return ``text`` after passing it through the module-level ``spell`` object."""
    return spell(text)


# 11- Slang lookup
# Load the slang dictionary as a list of raw lines, which slang_lookup()
# splits on "=". The context manager closes the file handle as soon as the
# contents are read (it was previously left open for the whole kernel session).
with open("slang.txt", "r") as file:
    slang = file.read().split('\n')

def slang_lookup(text, slang_lines=None):
    """Replace slang tokens in ``text`` with their meaning.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace and each token is looked up as-is
        (exact, case-sensitive match).
    slang_lines : iterable of str, optional
        Dictionary lines. Each line is split on ``=``: the part before the
        first ``=`` is the slang word and the part after the last ``=`` is its
        meaning. Defaults to the module-level ``slang`` list.

    Returns
    -------
    str
        The tokens, with known slang replaced, joined by single spaces.
    """
    if slang_lines is None:
        slang_lines = slang
    # A dict gives constant-time lookups instead of scanning two parallel lists
    # for every token. setdefault keeps the FIRST meaning when a slang word is
    # listed more than once, matching the earlier list.index() behaviour.
    meanings = {}
    for line in slang_lines:
        parts = line.split("=")
        meanings.setdefault(parts[0], parts[-1])
    return " ".join(meanings.get(token, token) for token in text.split())


# 12- Remove Punctuations
import string   
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    drop_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(drop_table)
 
    

# 13- Remove whitespaces
def remove_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends."""
    words = text.split()
    return " ".join(words)
  
    
# 14- Remove  stopwords
import nltk
# 'stopwords' provides the stop-word lists used below. word_tokenize() (called
# by remove_stopwords and stem_words) additionally needs the Punkt tokenizer
# data; without it every call raises LookupError on a fresh environment.
# NOTE(review): newer NLTK releases load 'punkt_tab' rather than 'punkt', so
# both are requested here - confirm against the installed NLTK version.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# A set makes the per-token membership test in remove_stopwords() O(1).
en_stops = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop English stop words from ``text`` and delete digit runs.

    The text is tokenized with ``word_tokenize``; tokens present in the
    module-level ``en_stops`` set are discarded (exact match), the rest are
    re-joined with single spaces, and any remaining run of digits is removed.
    """
    kept = []
    for token in word_tokenize(text):
        if token in en_stops:
            continue
        kept.append(token)
    return re.sub(r'\d+', '', " ".join(kept))


# 15- Stemming
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
# One Porter stemmer instance shared by every stem_words() call.
stemmer = PorterStemmer()
def stem_words(text):
    """Tokenize ``text`` and return the Porter stems joined by single spaces."""
    return " ".join(map(stemmer.stem, word_tokenize(text)))

# Full tweet preprocessing pipeline

In [None]:
def preprocess_tweets(text):
    """Run the complete cleaning pipeline on one tweet and return the result.

    Each step receives the output of the previous one. The order matters:
    for example, attached words are split before lower-casing removes the
    capital letters that mark the word boundaries, and contractions are
    expanded before punctuation (including apostrophes) is stripped.
    """
    pipeline = (
        clear_HTMLcharacters,
        encode_utf8,
        remove_URL_Hashtags_Style,
        split_attached_words,
        remove_more_than_twice,
        text_lowercase,
        convert_number,
        replace_contraction,
        remove_punctuations,
        slang_lookup,
        remove_whitespace,
        remove_stopwords,
        spell_checking,
        stem_words,
    )
    for step in pipeline:
        text = step(text)
    return text


In [None]:
# Clean every tweet and store the result in the "preprocess_tweets" column.
# Iterating over the frame's own index labels keeps each write aligned with
# the row it came from, instead of relying on a separately maintained counter
# happening to equal the label.
for row_label, tweet in zip(df.index, df["Text"]):
    try:
        df.loc[row_label, "preprocess_tweets"] = preprocess_tweets(tweet)
        print("tweet number ", row_label, "is preprocessed")
    except Exception as e:
        # Best effort: a tweet that cannot be processed (e.g. a non-string
        # cell) is reported together with its row and left without a value.
        print("tweet number ", row_label, "failed:", e)

# Find image classes with EfficientNet

In [None]:
import torch
from torchvision import transforms
from efficientnet_pytorch import EfficientNet

# Pretrained EfficientNet-B7 classifier, switched to evaluation mode because
# it is only used for inference in this notebook.
model = EfficientNet.from_pretrained('efficientnet-b7')
model.eval()

# Image -> tensor pipeline: resize, convert to a tensor, normalise per channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics - confirm.
tfms = transforms.Compose([transforms.Resize(224),
                           transforms.ToTensor(),
                           transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])

# labels.txt is parsed as JSON and must map the string keys "0".."999" to
# labels. The context manager closes the file handle, which a bare
# json.load(open(...)) left open.
with open('labels.txt') as labels_file:
    labels_map = json.load(labels_file)
# Re-shape the mapping into a list so a class index can be used directly.
labels_map = [labels_map[str(i)] for i in range(1000)]

In [None]:
from PIL import Image
from io import BytesIO

# Row window to classify. Same bounds as the original hard-coded range(75, 500).
# NOTE(review): rows 0-74 are skipped; presumably they were handled in an
# earlier run - confirm.
FIRST_ROW, LAST_ROW = 75, 500
# Without a timeout a single unresponsive host would hang the whole loop.
REQUEST_TIMEOUT_S = 30

# Clamp the upper bound so a frame with fewer than LAST_ROW rows does not raise
# on the URL lookup, which happens outside the try block.
for i in range(FIRST_ROW, min(LAST_ROW, len(df))):
    objects = []
    print(i)
    url = df["Media URLs"][i]
    print(url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        # Fail with a clear HTTP error instead of trying to decode an error page.
        response.raise_for_status()
        # Force three channels: grayscale, palette or RGBA images would
        # otherwise break the 3-channel Normalize step in `tfms`.
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img = tfms(img).unsqueeze(0)
        with torch.no_grad():
            outputs = model(img)
        # Labels of the ten highest-scoring classes, best first.
        top_indices = torch.topk(outputs, k=10).indices.squeeze(0).tolist()
        objects = [labels_map[idx] for idx in top_indices]
        print(objects)
    except Exception as e:
        # Best effort: report the problem and store an empty label string.
        print(e)
    df.loc[[i], "object"] = ' '.join(objects)

In [None]:
# Keep only the cleaned text, the detected image labels and the Emoji column.
# This rebinds `df`, so the cells above must be re-run to get the full frame back.
output_columns = ['preprocess_tweets', 'object', 'Emoji']
df = df[output_columns]
df.head(n=20)

In [None]:
# Write the reduced frame (including its index) to an Excel workbook.
df.to_excel(excel_writer="preprocess_data.xlsx")