#### Text Processing for Pre-trained/LLM Models
#### Python
#### Ref: https://www.kaggle.com/code/praveenkotha2/end-to-end-text-processing-for-beginners/notebook

In [6]:
import os
import pickle
import re
import string
from tqdm import tqdm
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import word2vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
# Switch Plotly into notebook mode so figures produced with `iplot` can be shown in cell outputs.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of embedding it,
# which would require internet access when viewing - confirm against the Plotly docs.
init_notebook_mode(connected=True)
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Suppress every warning for the rest of the session to keep cell outputs short.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries imported
# above; consider narrowing the filter to specific categories while debugging.
warnings.filterwarnings('ignore')


In [11]:
# Load the IMDB reviews into `train_df` / `test_df` (columns: 'Review', 'Label').
#
# Each review is stored as its own .txt file under <path>/<train|test>/<pos|neg>/.
# The first run walks those directories, builds the two frames, shuffles them and
# caches them as CSV; later runs simply read the cached CSVs back.
path = './dataset/aclImdb/'
train_csv = os.path.join(path, 'train.csv')
test_csv = os.path.join(path, 'test.csv')

# Both cache files must exist. Each path needs its own isfile() call:
# isfile(a and b) would only ever test `b`, because `and` on two non-empty
# strings evaluates to the second one.
if not (os.path.isfile(train_csv) and os.path.isfile(test_csv)):
    train_text = []
    train_label = []
    test_text = []
    test_label = []
    # Not used in this cell; kept in case later cells refer to them.
    train_data_path_pos = os.path.join(path, 'train/pos/')
    train_data_path_neg = os.path.join(path, 'train/neg/')

    for data in ['train', 'test']:
        for label in ['pos', 'neg']:
            label_dir = os.path.join(path, data, label)
            # sorted() makes the read order independent of the filesystem.
            for file in sorted(os.listdir(label_dir)):
                if not file.endswith('.txt'):
                    continue
                # Explicit encoding so decoding does not depend on the platform's
                # locale default (assumes the review files are UTF-8).
                with open(os.path.join(label_dir, file), encoding='utf-8') as file_data:
                    review = file_data.read()
                sentiment = 1 if label == 'pos' else 0  # 1 = positive, 0 = negative
                if data == 'train':
                    train_text.append(review)
                    train_label.append(sentiment)
                else:
                    test_text.append(review)
                    test_label.append(sentiment)

    train_df = pd.DataFrame({'Review': train_text, 'Label': train_label})
    test_df = pd.DataFrame({'Review': test_text, 'Label': test_label})
    # Shuffle so positive and negative reviews are interleaved; the fixed seed
    # makes a regenerated cache identical from run to run.
    train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
    test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # The index is written as the first CSV column and restored via index_col=0 below.
    train_df.to_csv(train_csv)
    test_df.to_csv(test_csv)

else:
    train_df = pd.read_csv(train_csv, index_col=0)
    test_df = pd.read_csv(test_csv, index_col=0)

print('The shape of train data:',train_df.shape)
print('The shape of test data:', test_df.shape)

The shape of train data: (25000, 2)
The shape of test data: (25000, 2)


In [13]:
# Preview the first five rows of the training frame to sanity-check the Review/Label columns.
train_df.head(5)

Unnamed: 0,Review,Label
0,I stumbled across this (Act-I) by pure dumb lu...,1
1,Alone In The Dark is one BAD movie and tied wi...,0
2,Much in the same way Frank Miller and his Sin ...,1
3,"""Bullfighter"" was made in 2000 but it is being...",0
4,"Weak Bobby ""Pineapple Salsa"" Flay and Mario Ba...",0


In [14]:
# Summarise the training frame: column names, non-null counts, dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  25000 non-null  object
 1   Label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB
