# PREPROCESSING DATA

In [1]:
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from functools import reduce
with open('stopwords.txt') as f:
    stopwords_list = []
    for row in f:
        stopwords_list.append(row.rstrip('\n'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
tweet = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

In [3]:
# method
def remove_URL(text):
    # url = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(r'http\S+','', text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_stopwords(text):
        # stopwords_list = stopwords.words('english')
    return ' '.join([word for word in text.split() if word not in stopwords_list])

def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def remove_invalid_char(text):
    return re.sub(r'[^a-zA-Z0-9\s]','',text)

def remove_leading_whitespace(text):
    return text.strip()

def to_lower(text):
    return text.lower()

def remove_mention(text):
    return re.sub(r'@\S+','',text)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(text)

def stemming(text):
    ps = PorterStemmer()
    words = word_tokenize(text)
    return reduce(lambda x, y: x + " " + ps.stem(y), words, "")
    return ' '.join([ps.stem(word) for word in words])

In [4]:
remove_stopwords("I'm on top of the hill and I can see a fire in the woods...".lower())

'top hill can see fire woods...'

In [5]:
def preprocessing_csv(df, type_file = 'train'):
    returned_df = df.copy()
    dict_func = [
                to_lower,
                remove_stopwords,
                remove_URL,
                remove_html,
                # remove_emoji,
                # remove_punct,
                remove_mention,
                remove_leading_whitespace,
                remove_invalid_char,
                #  lemmatize,
                # stemming
    ]
    for func in dict_func:
        returned_df['text'] = returned_df['text'].apply(lambda x: func(x))

    returned_df['keyword'] = returned_df['keyword'].str.replace('%20', ' ')
    
    # df.to_csv(f'./preprocessing/{type_file}.csv', index = False)

    # print(f'Tiền xử lý vào ghi dữ liệu của tập {type_file} thành công !!')
    if(type_file == 'train'):
        empty_text_rows = returned_df[returned_df['text'] == '']
        returned_df = returned_df.drop(empty_text_rows.index)
    # elif(type == 'test'):
        
    return returned_df

In [6]:
df = preprocessing_csv(tweet, 'train')
df.to_csv(f'./preprocessing/train.csv', index = False)
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent photo ruby alaska smoke wildfire...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearb...,1
7609,10870,,,control wild fires california even northern pa...,1
7610,10871,,,m194 0104 utc5km s volcano hawaii,1
7611,10872,,,police investigating ebike collided car little...,1


In [7]:
# Remove the similar tweet
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def out_duplicate_text(df, outFile):
# Create a sample dataframe
# df = pd.DataFrame({'text': ['I love banana', 'I very love banana', 'I hate apples', 'I like oranges']})

# Convert the text data into a matrix of TF-IDF features
    # df = preprocessing_csv(tweet)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['text'])

    # Calculate the cosine similarity between each pair of text data
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Define a threshold for similarity score
    threshold = 0.7

    # Group the text data based on similarity score
    # groups = []
    # visited = set()

    # for i in range(len(cosine_sim)):
    #     if i not in visited:
    #         group = [i]
    #         for j in range(i+1, len(cosine_sim)):
    #             if cosine_sim[i][j] >= threshold and cosine_sim:
    #                 group.append(j)
    #                 visited.add(j)
    #         groups.append(group)

    groups = {}
    for i in range(len(df)):
        group_id = None
        for j in range(i):
            if cosine_sim[i,j] > threshold:
                if group_id is None:
                    group_id = j
                elif df["target"][j] > df["target"][group_id]:
                    group_id = j
        if group_id is None:
            group_id = i
        groups.setdefault(group_id, []).append(i)
    
    count = 0
    
    with open(outFile, "w") as f:
        for group in groups.values():
            if(len(group) > 1):
                f.write(f'Group {count}:')
                for index in group:
                    f.write(f' - id={df.iloc[index]["id"]}, {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}, location={df.iloc[index]["location"]}, key={df.iloc[index]["keyword"]}')
                    f.write('\n')
                    # print(f' - id={df.iloc[index]["id"]}, {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}, location={df.iloc[index]["location"]}, key={df.iloc[index]["keyword"]}')
                count = count + 1
    # return groups
    # # Print the groups
    # with open(outFile, "w") as f:
    #     for i, group in enumerate(groups):
    #         if(len(group) > 1):
    #             f.write(f'Group {i+1}:')
    #             f.write('\n')
    #             for index in group:
    #                 f.write(f' - id={df.iloc[index]["id"]}, {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}, location={df.iloc[index]["location"]}, key={df.iloc[index]["keyword"]}')
    #                 f.write('\n')

In [8]:
out_duplicate_text(df=df, outFile="identical_rows.txt")

KeyError: 4497

In [None]:
# Remove the similar tweet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def out_duplicate_text_with_same_loc(df, outFile):
# Create a sample dataframe
# df = pd.DataFrame({'text': ['I love banana', 'I very love banana', 'I hate apples', 'I like oranges']})

# Convert the text data into a matrix of TF-IDF features
    # df = preprocessing_csv(tweet)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['text'])

    # Calculate the cosine similarity between each pair of text data
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Define a threshold for similarity score
    threshold = 0.7

    # Group the text data based on similarity score
    # groups = []
    # visited = set()

    # for i in range(len(cosine_sim)):
    #     if i not in visited:
    #         group = [i]
    #         for j in range(i+1, len(cosine_sim)):
    #             if cosine_sim[i][j] >= threshold and cosine_sim:
    #                 group.append(j)
    #                 visited.add(j)
    #         groups.append(group)

    groups = {}
    for i in range(len(df)):
        group_id = None
        for j in range(i):
            if cosine_sim[i,j] > threshold and df['location'][i] == df['location'][j]:
                if group_id is None:
                    group_id = j
                elif df["target"][j] > df["target"][group_id]:
                    group_id = j
        if group_id is None:
            group_id = i
        groups.setdefault(group_id, []).append(i)
    
    count = 0
    
    with open(outFile, "w") as f:
        for group in groups.values():
            if(len(group) > 1):
                f.write(f'Group {count}:')
                for index in group:
                    f.write(f' - id={df.iloc[index]["id"]}, {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}, location={df.iloc[index]["location"]}, key={df.iloc[index]["keyword"]}')
                    f.write('\n')
                    # print(f' - id={df.iloc[index]["id"]}, {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}, location={df.iloc[index]["location"]}, key={df.iloc[index]["keyword"]}')
                count = count + 1
    # return groups
    # # Print the groups
    # with open(outFile, "w") as f:
    #     for i, group in enumerate(groups):
    #         if(len(group) > 1):
    #             f.write(f'Group {i+1}:')
    #             f.write('\n')
    #             for index in group:
    #                 f.write(f' - id={df.iloc[index]["id"]}, {df.iloc[index]["text"]}, target={df.iloc[index]["target"]}, location={df.iloc[index]["location"]}, key={df.iloc[index]["keyword"]}')
    #                 f.write('\n')

In [None]:
out_duplicate_text_with_same_loc(df=preprocessing_csv(tweet),outFile='identical_rows_with_same_loc.txt')

KeyboardInterrupt: 

In [None]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Load the dataframe
# df = preprocessing_csv(tweet)

# # Define a function to group similar text
# def group_similar_text(df, threshold=0.7):
#     # Use TfidfVectorizer to transform the text into a vector representation
#     vectorizer = TfidfVectorizer()
#     vectors = vectorizer.fit_transform(df['text'])

#     # Use cosine_similarity to calculate pairwise similarities between the vectors
#     similarities = cosine_similarity(vectors)

#     # Create a dictionary to store the groups
#     groups = {}
#     for i in range(len(df)):
#         found_group = False
#         id = df.loc[i, 'id']
#         keyword = df.loc[i, 'keyword']
#         location = df.loc[i, 'location']
#         text = df.loc[i, 'text']
#         target = df.loc[i, 'target']

#         # Check if the text belongs to an existing group
#         for group_text, group_target in groups.items():
#             score = similarities[i, group_target[0]]
#             if score >= threshold:
#                 group_target.append(target)
#                 found_group = True
#                 break

#         # If the text doesn't belong to an existing group, create a new one
#         if not found_group:
#             groups[text] = [i, id, keyword, location, target]
        
#     # Create a new dataframe with the most frequent target for each group
#     new_data = {'id':[], 'keyword': [], 'location':[], 'text': [], 'target': [], 'group_size': []}
#     for group_text, group_info in groups.items():
#         # Get the indices of the rows in the group
#         group_indices = [group_info[0]]
#         for i in range(len(df)):
#             if i != group_info[0] and similarities[i, group_info[0]] >= threshold:
#                 group_indices.append(i)

#         # Get the most frequent target in the group
#         group_targets = [df.loc[i, 'target'] for i in group_indices]
#         most_frequent_target = max(set(group_targets), key=group_targets.count)

#         # Add the group to the new dataframe
#         new_data['id'].append()
#         new_data['text'].append(group_text)
#         new_data['target'].append(most_frequent_target)
#         new_data['group_size'].append(len(group_indices))

#     # Create the final dataframe
#     new_df = pd.DataFrame(new_data)

#     return new_df

# # Group the similar text in the dataframe
# new_df = group_similar_text(df, threshold=0.7)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def group_similar_texts(df, threshold):
    """
    Group similar texts in a DataFrame based on cosine similarity of TF-IDF vectors,
    and keep the row with the highest frequency of target in each group.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame containing the id, keyword, location, text, and target columns.
    - threshold: float
        The cosine similarity threshold above which texts are considered similar.

    Returns:
    - grouped_df: pandas DataFrame
        A new DataFrame containing the text data for each group and the most common target,
        as well as the id, keyword, and location columns of the row with the highest frequency of target.
    """

    # Convert the text data into numerical features using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["text"])

    # Calculate the pairwise cosine similarity between documents
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Group similar documents with the given threshold and keep the row with the highest frequency of target
    groups = {}
    for i in range(len(df)):
        group_id = None
        for j in range(i):
            if cosine_sim[i,j] > threshold:
                if group_id is None:
                    group_id = j
                elif df["target"][j] > df["target"][group_id]:
                    group_id = j
        if group_id is None:
            group_id = i
        groups.setdefault(group_id, []).append(i)


    # Create a new DataFrame with the grouped data and the most common target in each group
    grouped_data = []
    for group in groups.values():
        target_freq = df.loc[group]["target"].value_counts()
        most_common_target = target_freq.index[0]
        representative = df.loc[(df["target"] == most_common_target) & (df.index.isin(group))].iloc[0]        
        grouped_data.append({
            "id": representative["id"],
            "keyword": representative["keyword"],
            "location": representative["location"],
            "text": ", ".join(df.loc[group]["text"]),
            "target": most_common_target
        })
    grouped_df = pd.DataFrame(grouped_data)

    return grouped_df

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def group_similar_texts_with_same_loc(df, threshold):
    """
    Group similar texts in a DataFrame based on cosine similarity of TF-IDF vectors,
    and keep the row with the highest frequency of target in each group.

    Parameters:
    - df: pandas DataFrame
        The input DataFrame containing the id, keyword, location, text, and target columns.
    - threshold: float
        The cosine similarity threshold above which texts are considered similar.

    Returns:
    - grouped_df: pandas DataFrame
        A new DataFrame containing the text data for each group and the most common target,
        as well as the id, keyword, and location columns of the row with the highest frequency of target.
    """

    # Convert the text data into numerical features using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["text"])

    # Calculate the pairwise cosine similarity between documents
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Group similar documents with the given threshold and keep the row with the highest frequency of target
    groups = {}
    for i in range(len(df)):
        group_id = None
        for j in range(i):
            if cosine_sim[i,j] > threshold and df['location'][i] == df['location'][j]:
                if group_id is None:
                    group_id = j
                elif df["target"][j] > df["target"][group_id]:
                    group_id = j
        if group_id is None:
            group_id = i
        groups.setdefault(group_id, []).append(i)

    # Create a new DataFrame with the grouped data and the most common target in each group
    grouped_data = []
    for group in groups.values():
        target_freq = df.loc[group]["target"].value_counts()
        most_common_target = target_freq.index[0]
        representative = df.loc[(df["target"] == most_common_target) & (df.index.isin(group))].iloc[0]        
        grouped_data.append({
            "id": representative["id"],
            "keyword": representative["keyword"],
            "location": representative["location"],
            "text": ", ".join(df.loc[group]["text"]),
            "target": most_common_target
        })
    grouped_df = pd.DataFrame(grouped_data)

    return grouped_df

In [None]:
train1_df = group_similar_texts(df, 0.7)
train1_df.to_csv(f'./preprocessing/train1.csv', index = False)

KeyboardInterrupt: 

In [None]:
train_same_loc_df = group_similar_texts_with_same_loc(df, 0.7)
train_same_loc_df.to_csv(f'./preprocessing/train_same_loc.csv', index = False)

In [None]:
df

In [None]:
pd.read_csv('./preprocessing/train1.csv')

In [None]:
pd.read_csv('./preprocessing/train_same_loc.csv')