In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
#Downloading necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kalyankumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kalyankumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kalyankumar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#Step 1: Loading the datasets
#Ensuring that the dataset has a label column indicating 'machine-generated' or 'user-generated'
train_df = pd.read_csv('/Users/kalyankumar/Downloads/archive (2)/CSV/train.csv')
val_df = pd.read_csv('/Users/kalyankumar/Downloads/archive (2)/CSV/validation.csv')
test_df = pd.read_csv('/Users/kalyankumar/Downloads/archive (2)/CSV/test.csv')

In [4]:
#Step 2: Handling Missing Data
def handle_missing_data(df, text_column):
    print(f"Missing values in '{text_column}' column before cleaning: {df[text_column].isnull().sum()}")
    df = df.dropna(subset=[text_column])  # Drop rows with missing text data
    print(f"Missing values in '{text_column}' column after cleaning: {df[text_column].isnull().sum()}")
    return df

# Applying missing data handling to each dataset
train_df = handle_missing_data(train_df, 'dialogue')
val_df = handle_missing_data(val_df, 'dialogue')
test_df = handle_missing_data(test_df, 'dialogue')

Missing values in 'dialogue' column before cleaning: 0
Missing values in 'dialogue' column after cleaning: 0
Missing values in 'dialogue' column before cleaning: 0
Missing values in 'dialogue' column after cleaning: 0
Missing values in 'dialogue' column before cleaning: 0
Missing values in 'dialogue' column after cleaning: 0


In [5]:
#Step 3: Text Cleaning
def clean_text(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Removing non-alphabetical characters
    text = text.strip()  # Removing leading and trailing whitespace
    return text

In [6]:
#Step 4: Preprocessing Text
def preprocess_text(text):

#Text cleaning
    text = clean_text(text)
    
#Tokenization and Lowercasing
    tokens = word_tokenize(text.lower())  # Tokenizing and convert to lowercase
    
#Removing Stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
#Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)  # Joining tokens back into a single string

In [7]:
#Step 5: Full Preprocessing Workflow for each dataset
def preprocess_dataset(df, text_column):
#Handling missing data
    df = handle_missing_data(df, text_column)
    
#Applying text preprocessing
    df['processed_text'] = df[text_column].apply(preprocess_text)
    
    return df

# Apply preprocessing to each dataset
train_df = preprocess_dataset(train_df, 'dialogue')
val_df = preprocess_dataset(val_df, 'dialogue')
test_df = preprocess_dataset(test_df, 'dialogue')

Missing values in 'dialogue' column before cleaning: 0
Missing values in 'dialogue' column after cleaning: 0
Missing values in 'dialogue' column before cleaning: 0
Missing values in 'dialogue' column after cleaning: 0
Missing values in 'dialogue' column before cleaning: 0
Missing values in 'dialogue' column after cleaning: 0


In [8]:
print("Training Data:")
print(train_df[['dialogue', 'processed_text']].head())

Training Data:
                                            dialogue  \
0  #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...   
1  #Person1#: Hello Mrs. Parker, how have you bee...   
2  #Person1#: Excuse me, did you see a set of key...   
3  #Person1#: Why didn't you tell me you had a gi...   
4  #Person1#: Watsup, ladies! Y'll looking'fine t...   

                                      processed_text  
0  person hi mr smith im doctor hawkins today per...  
1  person hello mr parker person hello dr peter f...  
2  person excuse see set key person kind key pers...  
3  person didnt tell girlfriend person sorry thou...  
4  person watsup lady yll lookingfine tonight may...  


In [9]:
print("\nValidation Data:")
print(val_df[['dialogue', 'processed_text']].head())


Validation Data:
                                            dialogue  \
0  #Person1#: Hello, how are you doing today?\n#P...   
1  #Person1#: Hey Jimmy. Let's go workout later t...   
2  #Person1#: I need to stop eating such unhealth...   
3  #Person1#: Do you believe in UFOs?\n#Person2#:...   
4  #Person1#: Did you go to school today?\n#Perso...   

                                      processed_text  
0  person hello today person trouble breathing la...  
1  person hey jimmy let go workout later today pe...  
2  person need stop eating unhealthy food person ...  
3  person believe ufo person course person never ...  
4  person go school today person course person di...  


In [10]:
print("\nTest Data:")
print(test_df[['dialogue', 'processed_text']].head())


Test Data:
                                            dialogue  \
0  #Person1#: Ms. Dawson, I need you to take a di...   
1  #Person1#: Ms. Dawson, I need you to take a di...   
2  #Person1#: Ms. Dawson, I need you to take a di...   
3  #Person1#: You're finally here! What took so l...   
4  #Person1#: You're finally here! What took so l...   

                                      processed_text  
0  person m dawson need take dictation person yes...  
1  person m dawson need take dictation person yes...  
2  person m dawson need take dictation person yes...  
3  person youre finally took long person got stuc...  
4  person youre finally took long person got stuc...  


In [11]:
#Exporting the datasets individually

train_df.to_csv('/Users/kalyankumar/Downloads/train_processed.csv', index=False)
val_df.to_csv('/Users/kalyankumar/Downloads/val_processed.csv', index=False)
test_df.to_csv('/Users/kalyankumar/Downloads/test_processed.csv', index=False)