### Import Relevant Packages

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

### Importing Dataset

In [24]:
# Load the labelled dataset from an Excel workbook (relative path, resolved against the
# current working directory).
df = pd.read_excel('Enhanced_Synthetic_Data_10000.xlsx')

# Preview the first rows to confirm the 'Text' and 'Scope' columns loaded as expected
print(df.head())

                                                Text Scope
0  Best practices for database management for aut...    In
1  Guide to using database management for AI deve...    In
2                     What is SQL queries in finance   Out
3   Best practices for web development for reporting    In
4  Introduction to database management for data p...   Out


### Splitting Dependent and Independent Variables

In [4]:
X = df['Text']  # Features: raw text strings (vectorized in a later cell)
y = df['Scope']  # Target: string class label, 'In' or 'Out'

### Splitting the Dataset into Training and Testing Sets

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Converting Text into Numeric Features - TF-IDF Vectorization

In [6]:
# TF-IDF vectorizer with scikit-learn's built-in English stop-word list removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and IDF weights from the training texts only, then encode them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Encode the test texts with the already-fitted vectorizer (no re-fitting), so both
# matrices share the same feature columns.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

### Creating Logistic Regression Model

In [7]:
# Logistic regression with library defaults except for the solver iteration cap (max_iter=1000).
log_reg = LogisticRegression(max_iter=1000)

### Training the model

In [8]:
# Fit on the TF-IDF training matrix; as the last expression, the fitted estimator's repr is displayed.
log_reg.fit(X_train_tfidf, y_train)

LogisticRegression(max_iter=1000)

### Assessing the quality of the model

In [9]:
# Predict class labels for the held-out test matrix with the logistic-regression model.
y_pred = log_reg.predict(X_test_tfidf)

In [14]:
# Fraction of test rows whose predicted label equals the true label.
# NOTE(review): a perfect score is unusual — worth checking whether identical or
# near-identical texts appear in both the train and test splits. TODO confirm.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [15]:
# Per-class precision / recall / F1 and support for the predictions currently held in y_pred.
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

          In       1.00      1.00      1.00       989
         Out       1.00      1.00      1.00      1011

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Fit a 100-tree random forest on the TF-IDF training matrix (seeded for repeatability).
random_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_tfidf, y_train
)

# Score the held-out split. y_pred and accuracy are rebound here, replacing the values
# produced earlier by the logistic-regression cells.
y_pred = random_forest.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


### Prediction for user data

In [17]:
# Spot check: classify only the first row of the test TF-IDF matrix.
sample_pred = random_forest.predict(X_test_tfidf[0])

In [18]:
# Bare expression so the notebook displays the predicted label array.
sample_pred

array(['In'], dtype=object)

In [51]:
# Load a second workbook, sample.xlsx, for the user-data prediction section
# (relative path, resolved against the current working directory).
df1 = pd.read_excel('sample.xlsx')

In [52]:
input=df['Text']

In [53]:
input1=tfidf_vectorizer.fit_transform(input)

In [55]:
# Predict the label for the first row of input1 only (rebinds sample_pred).
sample_pred = random_forest.predict(input1[0])

In [56]:
# Bare expression so the notebook displays the predicted label array.
sample_pred

array(['In'], dtype=object)

In [35]:
# In top-to-bottom order y_pred was last assigned by the random-forest cell, so this
# report describes the random forest rather than the logistic regression.
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

          In       1.00      1.00      1.00       989
         Out       1.00      1.00      1.00      1011

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



### Applying Regularization - L1 (Lasso-style) Penalized Logistic Regression

In [16]:
# Logistic regression with an L1 penalty, fitted on the TF-IDF training matrix.
# The liblinear solver is selected because it supports the L1 penalty.
log_reg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1.0,
    max_iter=1000,
).fit(X_train_tfidf, y_train)

# Predictions go to their own name so the earlier y_pred is left untouched.
y_pred_l1 = log_reg_l1.predict(X_test_tfidf)

# Report overall accuracy followed by the per-class breakdown.
print("Accuracy with L1 Regularization:", accuracy_score(y_test, y_pred_l1))
print("Classification Report:\n", classification_report(y_test, y_pred_l1))

Accuracy with L1 Regularization: 1.0
Classification Report:
               precision    recall  f1-score   support

          In       1.00      1.00      1.00       989
         Out       1.00      1.00      1.00      1011

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

