In [1]:
#pip install kagglehub

In [2]:
import pandas as pd
import kagglehub
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download latest version
path = kagglehub.dataset_download("prakharrathi25/reddit-data-huge")

In [4]:
# List all CSV files in the folder
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]

# Initialize a list to hold DataFrames
dataframes = []

# Read each CSV file into a DataFrame and add it to the list
for csv_file in csv_files:
    file_path = os.path.join(path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

# Optionally, combine all DataFrames into a single DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37580 entries, 0 to 37579
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     34712 non-null  object 
 1   ID             34712 non-null  object 
 2   is_Original    26311 non-null  object 
 3   Flair          19587 non-null  object 
 4   num_comments   29179 non-null  float64
 5   Title          29179 non-null  object 
 6   Subreddit      37580 non-null  object 
 7   Body           16405 non-null  object 
 8   URL            29179 non-null  object 
 9   Upvotes        37580 non-null  int64  
 10  Comments       27163 non-null  object 
 11  creation_date  37580 non-null  object 
 12  is_original    2868 non-null   object 
 13  Text           8400 non-null   object 
 14  Sentiment      8401 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 4.3+ MB


In [8]:
import re
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter




# Data Cleaning and Preprocessing without NLTK
def preprocess_text(text):
    # Convert to lowercase
    if isinstance(text, float):
        text = str(text)
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize words
    words = text.split()
    # Remove stopwords using sklearn's ENGLISH_STOP_WORDS
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]
    return ' '.join(words)

# Apply preprocessing to the text data
combined_df['Text'] = (combined_df['Title'] + ' ' + 
                                 combined_df['Body'].fillna('')).apply(preprocess_text)



In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

# Encode Subreddit labels
label_encoder = LabelEncoder()
combined_df['Label'] = label_encoder.fit_transform(combined_df['Subreddit'])

# Split data
X = combined_df['Text']
y = combined_df['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF and Logistic Regression
pipeline = make_pipeline(
    TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 2), max_features=10000, sublinear_tf=True),
    LogisticRegression(solver='lbfgs',max_iter=500)
)

# Grid Search for Hyperparameter Tuning
param_grid = {
    'logisticregression__C': [0.1, 1, 10, 100],  # Regularization strength
    'logisticregression__solver': ['liblinear', 'saga']  # Solver type
}
grid = GridSearchCV(pipeline, param_grid, cv=5)
grid.fit(X_train, y_train)

# Evaluate best model
best_model = grid.best_estimator_
accuracy = best_model.score(X_test, y_test)
print(f"Best Model Accuracy: {accuracy}")

Best Model Accuracy: 0.5876796168174561
