# PREPROCESSING DATA

In [21]:
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
tweet = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

In [23]:
# method
def remove_URL(text):
    # url = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(r'http\S+','', text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_stopwords(text):
    stopwords_list = stopwords.words('english')
    return ' '.join([word for word in text.split() if word not in stopwords_list])

def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def remove_invalid_char(text):
    return re.sub(r'[^a-zA-Z0-9\s]','',text)

def remove_leading_whitespace(text):
    return text.strip()

def to_lower(text):
    return text.lower()

def remove_mention(text):
    return re.sub(r'@\S+','',text)

In [24]:
def preprocessing_csv(df, type_file = 'train'):
    dict_func = {remove_URL,
                 remove_html,
                 remove_emoji,
                 remove_punct,
                 remove_stopwords,
                 remove_mention,
                 remove_leading_whitespace,
                 to_lower,
                 remove_invalid_char} 
    for func in dict_func:
        df['text'] = df['text'].apply(lambda x: func(x))

    
    df['keyword'] = df['keyword'].str.replace('%20', ' ')
    
    df.to_csv(f'./preprocessing/{type_file}.csv', index = False)

    print(f'Tiền xử lý vào ghi dữ liệu của tập {type_file} thành công !!')
    return df

In [28]:
# Remove the similar tweet
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create a sample dataframe
# df = pd.DataFrame({'text': ['I love banana', 'I very love banana', 'I hate apples', 'I like oranges']})

# Convert the text data into a matrix of TF-IDF features
df = preprocessing_csv(tweet)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Calculate the cosine similarity between each pair of text data
similarity_matrix = cosine_similarity(tfidf_matrix)

# Define a threshold for similarity score
threshold = 0.7

# Group the text data based on similarity score
groups = []
visited = set()

for i in range(len(similarity_matrix)):
    if i not in visited:
        group = [i]
        for j in range(i+1, len(similarity_matrix)):
            if similarity_matrix[i][j] >= threshold:
                group.append(j)
                visited.add(j)
        groups.append(group)

# Print the groups
for i, group in enumerate(groups):
    if(len(group) > 1):
        print(f'Group {i+1}:')
        for index in group:
            print(f' - {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}')

Tiền xử lý vào ghi dữ liệu của tập train thành công !!
Group 41:
 - check nsfw, target=0
 - check nsfw, target=0
Group 47:
 - west burned thousands wildfires ablaze california alone, target=1
 - west burned thousands wildfires ablaze california alone climate energy, target=1
Group 73:
 - i77 mile marker 31 south mooresville iredell vehicle accident ramp closed 86 118 pm, target=1
 - i77 mile marker 31 40 south mooresville iredell vehicle accident congestion 86 118 pm, target=1
Group 102:
 - 320 ir icemoon aftershock djicemoon dubstep trapmusic dnb edm dance ices, target=0
 - 320 ir icemoon aftershock djicemoon dubstep trapmusic dnb edm dance ices, target=0
 - 320 ir icemoon aftershock djicemoon dubstep trapmusic dnb edm dance ices, target=0
 - 320 ir icemoon aftershock djicemoon dubstep trapmusic dnb edm dance ices, target=0
 - 320 ir icemoon aftershock djicemoon dubstep trapmusic dnb edm dance ices, target=0
 - 320 ir icemoon aftershock djicemoon dubstep trapmusic dnb edm dance ices, 

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataframe
df = preprocessing_csv(tweet)

# Define a function to group similar text
def group_similar_text(df, threshold=0.7):
    # Use TfidfVectorizer to transform the text into a vector representation
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(df['text'])

    # Use cosine_similarity to calculate pairwise similarities between the vectors
    similarities = cosine_similarity(vectors)

    # Create a dictionary to store the groups
    groups = {}
    for i in range(len(df)):
        found_group = False
        text = df.loc[i, 'text']
        target = df.loc[i, 'target']

        # Check if the text belongs to an existing group
        for group_text, group_target in groups.items():
            score = similarities[i, group_target[0]]
            if score >= threshold:
                group_target.append(target)
                found_group = True
                break

        # If the text doesn't belong to an existing group, create a new one
        if not found_group:
            groups[text] = [i, target]
        
    # Create a new dataframe with the most frequent target for each group
    new_data = {'text': [], 'target': [], 'group_size': []}
    for group_text, group_info in groups.items():
        # Get the indices of the rows in the group
        group_indices = [group_info[0]]
        for i in range(len(df)):
            if i != group_info[0] and similarities[i, group_info[0]] >= threshold:
                group_indices.append(i)

        # Get the most frequent target in the group
        group_targets = [df.loc[i, 'target'] for i in group_indices]
        most_frequent_target = max(set(group_targets), key=group_targets.count)

        # Add the group to the new dataframe
        new_data['text'].append(group_text)
        new_data['target'].append(most_frequent_target)
        new_data['group_size'].append(len(group_indices))

    # Create the final dataframe
    new_df = pd.DataFrame(new_data)

    return new_df

# Group the similar text in the dataframe
new_df = group_similar_text(df, threshold=0.7)

Tiền xử lý vào ghi dữ liệu của tập train thành công !!


In [36]:

new_df.to_csv(f'./preprocessing/new.csv', index = False)

In [38]:
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearb...,1
7609,10870,,,ariaahrary thetawniest control wild fires cali...,1
7610,10871,,,m194 0104 utc5km volcano hawaii,1
7611,10872,,,police investigating ebike collided car little...,1


In [26]:
preprocessing_csv(tweet)

Tiền xử lý vào ghi dữ liệu của tập train thành công !!


Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearb...,1
7609,10870,,,ariaahrary thetawniest control wild fires cali...,1
7610,10871,,,m194 0104 utc5km volcano hawaii,1
7611,10872,,,police investigating ebike collided car little...,1


In [27]:
preprocessing_csv(test, 'test')

Tiền xử lý vào ghi dữ liệu của tập test thành công !!


Unnamed: 0,id,keyword,location,text
0,0,,,just happened terrible car crash
1,2,,,heard earthquake different cities stay safe ev...
2,3,,,forest fire spot pond geese fleeing across str...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills 28 china taiwan
...,...,...,...,...
3258,10861,,,earthquake safety los angeles safety fasteners...
3259,10865,,,storm ri worse last hurricane my cityamp3other...
3260,10868,,,green line derailment chicago
3261,10874,,,meg issues hazardous weather outlook hwo
