<a href="https://colab.research.google.com/github/uditkrishna7/Brainwave_Matrix_Internship/blob/master/Data1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import pandas as pd
import numpy as np
import nltk
import re
import string
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# 🔹 Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# ✅ Define Paths
BASE_DIR = r"C:\Users\ECC4016\MindFusion_Analyzer\Datasets"
DATA_FILE = os.path.join(BASE_DIR, "/Combined Data.csv")  # Your dataset path
PROCESSED_FILE = os.path.join(BASE_DIR, "processed_dataset.csv")
MODEL_FILE = os.path.join(BASE_DIR, "logistic_regression_model.pkl")
VECTORIZER_FILE = os.path.join(BASE_DIR, "tfidf_vectorizer.pkl")

# ✅ Ensure Directories Exist
os.makedirs(BASE_DIR, exist_ok=True)

# ✅ Load Dataset
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"Dataset not found at {DATA_FILE}. Please check the file path.")

df = pd.read_csv(DATA_FILE)
print("✅ Dataset Loaded Successfully.\n")

# 🔍 Check for missing values
print("🔍 Checking for missing values:")
print(df.isnull().sum(), "\n")

# ✅ Drop missing values
df.dropna(inplace=True)

# ✅ Text Preprocessing
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenization
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

# ✅ Apply text preprocessing
df['cleaned_statement'] = df['statement'].astype(str).apply(clean_text)
print("✅ Text Preprocessing Completed.\n")

# 📊 Label Encoding
label_mapping = {label: idx for idx, label in enumerate(df['status'].unique())}
df['status_encoded'] = df['status'].map(label_mapping)

# 📊 Label distribution before balancing
print("📊 Label distribution before balancing:")
print(df['status'].value_counts(), "\n")

# ✅ Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  # Reduced features for better speed
X = vectorizer.fit_transform(df['cleaned_statement'])
y = df['status_encoded']

# ✅ Save the vectorizer
joblib.dump(vectorizer, VECTORIZER_FILE)

# ✅ SMOTE for balancing the dataset
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X, y)

print("📊 Label distribution after SMOTE balancing:")
print(pd.Series(y_resampled).value_counts(), "\n")

# ✅ Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ✅ Train or Load Model
if os.path.exists(MODEL_FILE):
    print("\n🔄 Loading existing trained model...")
    model = joblib.load(MODEL_FILE)
else:
    print("\n⚡ Training Logistic Regression Model...")
    model = LogisticRegression(max_iter=500, solver='lbfgs', random_state=42)
    model.fit(X_train, y_train)
    joblib.dump(model, MODEL_FILE)  # Save model to avoid retraining
    print("\n✅ Model Training Completed & Saved.")

# ✅ Evaluate Model
y_pred = model.predict(X_test)

print("\n📈 Model Evaluation:")
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"✅ Precision (Weighted): {accuracy_score(y_test, y_pred):.2f}")

# 📄 Classification Report
print("\n📄 Classification Report:")
print(classification_report(y_test, y_pred))

# ✅ Save Processed Dataset
df.to_csv(PROCESSED_FILE, index=False)
print(f"\n✅ Processed dataset saved at: {PROCESSED_FILE}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Dataset Loaded Successfully.

🔍 Checking for missing values:
Unnamed: 0      0
statement     362
status          0
dtype: int64 



LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [3]:
import os
import pandas as pd
import numpy as np
import nltk
import re
import string
import joblib
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# 🔹 Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab') # Download the punkt_tab data

# ✅ Define Paths
BASE_DIR = r"C:\Users\ECC4016\MindFusion_Analyzer\Datasets"
DATA_FILE = os.path.join(BASE_DIR, "/Combined Data.csv")  # Your dataset path
PROCESSED_FILE = os.path.join(BASE_DIR, "processed_dataset.csv")
MODEL_FILE = os.path.join(BASE_DIR, "logistic_regression_model.pkl")
VECTORIZER_FILE = os.path.join(BASE_DIR, "tfidf_vectorizer.pkl")

# ✅ Ensure Directories Exist
os.makedirs(BASE_DIR, exist_ok=True)

# ✅ Load Dataset
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(f"Dataset not found at {DATA_FILE}. Please check the file path.")

df = pd.read_csv(DATA_FILE)
print("✅ Dataset Loaded Successfully.\n")

# 🔍 Check for missing values
print("🔍 Checking for missing values:")
print(df.isnull().sum(), "\n")

# ✅ Drop missing values
df.dropna(inplace=True)

# ✅ Text Preprocessing
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenization
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

# ✅ Apply text preprocessing
df['cleaned_statement'] = df['statement'].astype(str).apply(clean_text)
print("✅ Text Preprocessing Completed.\n")

# 📊 Label Encoding
label_mapping = {label: idx for idx, label in enumerate(df['status'].unique())}
df['status_encoded'] = df['status'].map(label_mapping)

# 📊 Label distribution before balancing
print("📊 Label distribution before balancing:")
print(df['status'].value_counts(), "\n")

# ✅ Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)  # Reduced features for better speed
X = vectorizer.fit_transform(df['cleaned_statement'])
y = df['status_encoded']

# ✅ Save the vectorizer
joblib.dump(vectorizer, VECTORIZER_FILE)

# ✅ SMOTE for balancing the dataset
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X, y)

print("📊 Label distribution after SMOTE balancing:")
print(pd.Series(y_resampled).value_counts(), "\n")

# ✅ Split Data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ✅ Train or Load Model
if os.path.exists(MODEL_FILE):
    print("\n🔄 Loading existing trained model...")
    model = joblib.load(MODEL_FILE)
else:
    print("\n⚡ Training Logistic Regression Model...")
    model = LogisticRegression(max_iter=500, solver='lbfgs', random_state=42)
    model.fit(X_train, y_train)
    joblib.dump(model, MODEL_FILE)  # Save model to avoid retraining
    print("\n✅ Model Training Completed & Saved.")

# ✅ Evaluate Model
y_pred = model.predict(X_test)

print("\n📈 Model Evaluation:")
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"✅ Precision (Weighted): {accuracy_score(y_test, y_pred):.2f}")

# 📄 Classification Report
print("\n📄 Classification Report:")
print(classification_report(y_test, y_pred))

# ✅ Save Processed Dataset
df.to_csv(PROCESSED_FILE, index=False)
print(f"\n✅ Processed dataset saved at: {PROCESSED_FILE}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✅ Dataset Loaded Successfully.

🔍 Checking for missing values:
Unnamed: 0      0
statement     362
status          0
dtype: int64 

✅ Text Preprocessing Completed.

📊 Label distribution before balancing:
status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64 

📊 Label distribution after SMOTE balancing:
status_encoded
0    16343
1    16343
2    16343
3    16343
4    16343
5    16343
6    16343
Name: count, dtype: int64 


⚡ Training Logistic Regression Model...

✅ Model Training Completed & Saved.

📈 Model Evaluation:
✅ Accuracy: 0.81
✅ Precision (Weighted): 0.81

📄 Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      3375
           1       0.69      0.82      0.75      3325
           2       0.72      0.61      0.66      3271

# New Section