In [None]:
# Uncomment the next line to install langdetect when it is not already available in the runtime.
#!pip install langdetect

# Mount Google Drive at /content/drive/ (requires the Google Colab runtime).
# NOTE(review): later cells read 'drive/MyDrive/...' as a relative path, which presumably
# resolves because the Colab working directory is /content — confirm.
from google.colab import drive
drive.mount("/content/drive/")

In [None]:
import shutil
import sys
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from langdetect import DetectorFactory, LangDetectException, detect
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Read the raw quotes CSV from the mounted Drive folder and report its size.
# Alternative location (kept for reference):
# dataset = pd.read_csv('drive/MyDrive/NLP/quotes.csv', delimiter=',')
quotes_csv_path = 'drive/MyDrive/NLP_Final_Project/dataset/quotes.csv'
dataset = pd.read_csv(quotes_csv_path, delimiter=',')
nRow, nCol = dataset.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Null value check: report missing values per column, then drop incomplete rows.
print(dataset.isnull().sum())
dataset.dropna(inplace=True)

# Remove commas from categories, e.g. 'Life, Motivation' -> 'Life Motivation'.
# regex=False: ', ' is a literal separator, not a pattern.
dataset['category'] = dataset['category'].str.replace(', ', ' ', regex=False)

# Strip surrounding whitespace from the text columns.
dataset['category'] = dataset['category'].str.strip()
dataset['quote'] = dataset['quote'].str.strip()
dataset['author'] = dataset['author'].str.strip()

# Convert upper case to lower case.
dataset['category'] = dataset['category'].str.lower()
dataset['quote'] = dataset['quote'].str.lower() ## comment if needed
dataset['author'] = dataset['author'].str.lower() ## comment if needed

# Drop rows that repeat a quote, keeping the first occurrence.
# This must be called on the DataFrame: `dataset.quote.drop_duplicates(inplace=True)`
# only changes a Series pulled out of the frame and leaves every row of `dataset` in place.
# It runs after normalisation so quotes differing only by case or outer whitespace collapse too.
dataset.drop_duplicates(subset='quote', inplace=True)

In [None]:
# Preview the first five rows of `dataset`.
dataset.head(n=5)

In [None]:
# Count how often every tag occurs across all rows.
# Categories are split on single spaces; empty fragments left by repeated spaces are skipped.
tag_occurrences = Counter(
    fragment.lower()
    for category in dataset['category']
    for fragment in category.split(' ')
    if fragment != ''
)
tags_dict = dict(tag_occurrences)

# Same counts, ordered from the most to the least frequent tag.
sorted_tags_dict = dict(
    sorted(tags_dict.items(), key=lambda item: item[1], reverse=True)
)

In [None]:
# Discard every tag that occurs fewer than `threshold` times.

threshold = 1500

# Tags that fall below the threshold.
tags_to_remove = [tag for tag, count in sorted_tags_dict.items() if count < threshold]

# Keep the remaining tags in their existing (most-frequent-first) order.
sorted_tags_dict = {
    tag: count for tag, count in sorted_tags_dict.items() if count >= threshold
}

print(f'After removing less frequent tags: there are {len(sorted_tags_dict)} types of different tags')

In [None]:
# List of tags to keep: the first `top_k` keys of sorted_tags_dict.
top_k = 5
tags_to_keep = list(sorted_tags_dict)[:top_k]
print(f'Top {top_k} tags are {tags_to_keep}')

# Remove all rows whose quote is longer than max_length_threshold or shorter than
# min_lenth_threshold (both measured in characters).

min_lenth_threshold = 25
max_length_threshold = 256
max_len_filtered_dataset = dataset[dataset['quote'].str.len() <= max_length_threshold].copy()
min_len_filtered_dataset = max_len_filtered_dataset[
    max_len_filtered_dataset['quote'].str.len() >= min_lenth_threshold
].copy()

# Report the size of the filtered frame; the unfiltered `dataset` still has every row.
print(f'After removing all quotes greater than length {max_length_threshold} and lesser than length {min_lenth_threshold}, there are {len(min_len_filtered_dataset)} rows remaining.')

min_len_filtered_dataset.head(5)

In [None]:
# Label each quote with the retained tags present in its category string.
# Tags appear in `tags_to_keep` order and are joined with '_'; no match gives ''.
matched_tags = min_len_filtered_dataset['category'].apply(
    lambda category: '_'.join(tag for tag in tags_to_keep if tag in category.split())
)
min_len_filtered_dataset['tags'] = matched_tags

# Drop quotes that carry none of the retained tags.
has_kept_tag = min_len_filtered_dataset['tags'] != ''
min_len_filtered_dataset = min_len_filtered_dataset[has_kept_tag]
print(f'Number of rows after removing quote not related to top_k tags : {len(min_len_filtered_dataset)}')

In [None]:
# Flag English quotes. Later cells filter on and then drop the `is_eng` column, so it must
# be created here for the notebook to run top-to-bottom on a fresh kernel.
DetectorFactory.seed = 0  # langdetect results vary between runs unless the detector is seeded


def _is_english(text):
    """Return True when langdetect classifies `text` as English ('en')."""
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # langdetect could not classify the text; treat it as not English.
        return False


# .assign returns a new frame, so the column is not written into a filtered slice.
min_len_filtered_dataset = min_len_filtered_dataset.assign(
    is_eng=min_len_filtered_dataset['quote'].apply(_is_english)
)

In [None]:
# Show a handful of quotes whose `is_eng` flag is False.
non_english_rows = min_len_filtered_dataset['is_eng'].eq(False)
min_len_filtered_dataset.loc[non_english_rows].head(5)

In [None]:
# Retain only the quotes whose `is_eng` flag is True.
english_rows = min_len_filtered_dataset['is_eng'].eq(True)
min_len_filtered_dataset = min_len_filtered_dataset.loc[english_rows]

# Optional export of the filtered frame (disabled):
# min_len_filtered_dataset.to_csv('drive/MyDrive/NLP_Final_Project/dataset/eng_len_filtered_quotes.csv', index=False, encoding='utf-8')

In [None]:
# Drop the author, category and is_eng columns; all other columns are kept.
sampled_df = min_len_filtered_dataset.drop(columns=['author', 'category', 'is_eng'])

In [None]:
# `df` is a second name for `sampled_df` (no copy), so the new column lands on both.
df = sampled_df

# Turn each '_'-joined tag string back into a list of tags.
df['tags_list'] = df['tags'].str.split('_')

# Tally how many rows carry each tag.
tag_counts = Counter()
for row_tags in df['tags_list']:
    tag_counts.update(row_tags)

print('Count of each tag in dataset :')
for tag, count in tag_counts.items():
    print(tag, ':', count)

In [None]:
# Build a roughly balanced dataset by drawing up to `samples_per_tag` quotes for each tag.
sampled_dataframes = {}
tags = ['love','life','inspirational', 'philosophy', 'humor']
samples_per_tag = 6000
for tag in tags:
    # regex=False: match the tag as a literal substring of the '_'-joined tag string.
    tag_rows = sampled_df[sampled_df['tags'].str.contains(tag, regex=False)]
    # Cap the request at the rows available; sample() raises when n exceeds the
    # population and replace=False.
    tag_sample = tag_rows.sample(n=min(samples_per_tag, len(tag_rows)), random_state=42)
    sampled_dataframes[tag] = tag_sample.drop_duplicates(subset='quote', keep='first')

# A quote carrying several tags can be drawn once per tag. De-duplicate across the combined
# frame so the same quote cannot end up in more than one of the later train/val/test splits.
final_df = (
    pd.concat(sampled_dataframes.values())
    .drop_duplicates(subset='quote', keep='first')
    .reset_index(drop=True)
)

print('Final Dataset Shape :',final_df.shape)
final_df.sample(5)

In [None]:
def split_data(df, random_seed):
  """Split `df` into stratified train, validation and test frames.

  20% of the rows are held out as the test set, then 15% of the remaining rows
  become the validation set. Both splits are stratified on the `tags` column and
  use `random_seed`. The per-tag counts of each split are printed in
  train, validation, test order.

  Args:
    df: DataFrame with a `tags` column (stratification key) and a `tags_list`
      column holding an iterable of tags per row.
    random_seed: Passed as `random_state` to both `train_test_split` calls.

  Returns:
    Tuple `(train_df, val_df, test_df)`.
  """
  train_val, test_df = train_test_split(df, test_size=0.2, random_state=random_seed, stratify=df['tags'])
  train_df, val_df = train_test_split(train_val, test_size=0.15, random_state=random_seed, stratify=train_val['tags'])

  for split in (train_df, val_df, test_df):
    # Count every tag of every row; dict(...) keeps the printed form a plain dict.
    tag_count = dict(Counter(tag for row_tags in split['tags_list'] for tag in row_tags))
    print(tag_count)
  return train_df, val_df, test_df

In [None]:
# Rebuild the per-row tag lists from the '_'-joined `tags` strings.
final_df['tags_list'] = final_df['tags'].str.split('_')

# Count individual tag occurrences across all rows.
tag_counts = Counter()
for row_tags in final_df['tags_list']:
    tag_counts.update(row_tags)

tag_counts

In [None]:
# Number of tags attached to each quote, then the distribution of those counts.
final_df['tag_count'] = final_df['tags_list'].map(len)
final_df['tag_count'].value_counts()