### Import Relevant Packages 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

### Load the Dataset

In [2]:
# Load the dataset from the Excel workbook (the path is relative to the working directory)
df = pd.read_excel('Enhanced_Synthetic_Data_1000.xlsx')

# Display the first few rows of the dataset to inspect
print(df.head())

                                       Text Scope
0     How to open a New Workbook in VS Code   Out
1           Analyze a New Workbook in Excel   Out
2             Format a New Workbook in Word    In
3            Share a New Workbook on GitHub    In
4  Organize a New Workbook in Google Sheets   Out


### Splitting Dependent Variable and Independent Variable

In [4]:
X = df['Text']  # Features (raw text of each row)
y = df['Scope']  # Target labels: the strings 'In' / 'Out', as shown in the preview above

### Splitting into Training data and Testing data

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### TF-IDF Vectorization

In [6]:
# TF-IDF vectorizer configured to drop scikit-learn's built-in English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Fit the vectorizer on the training text only and transform it in the same step
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Reuse the already-fitted vectorizer for the test text (transform only, no refit),
# so the test split does not influence the learned features
X_test_tfidf = tfidf_vectorizer.transform(X_test)

### Creation of Logistic Regression Model

In [7]:
# Baseline logistic regression classifier; max_iter=1000 sets the solver's iteration cap
log_reg = LogisticRegression(max_iter=1000)

### Training The model

In [8]:
# Fit the classifier on the TF-IDF training matrix and the training labels
log_reg.fit(X_train_tfidf, y_train)

LogisticRegression(max_iter=1000)

In [9]:
# Predict labels for the held-out test set
y_pred = log_reg.predict(X_test_tfidf)

### Assessing the quality of the model

In [10]:
# Accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.495


In [11]:
# Per-class precision, recall, F1-score and support for the test-set predictions
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

          In       0.51      0.48      0.49       103
         Out       0.48      0.52      0.50        97

    accuracy                           0.49       200
   macro avg       0.50      0.50      0.49       200
weighted avg       0.50      0.49      0.49       200



In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Random forest with 100 trees; random_state is fixed so repeated runs are reproducible
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)

# fit() returns the fitted estimator, so training and test-set prediction are chained
y_pred = random_forest.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Score the predictions against the true test labels and show the result
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.48


In [16]:
# Per-class metrics for the current y_pred (the random forest predictions when cells run in order)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

          In       0.49      0.41      0.45       103
         Out       0.47      0.56      0.51        97

    accuracy                           0.48       200
   macro avg       0.48      0.48      0.48       200
weighted avg       0.48      0.48      0.48       200



### Optimizing the model

In [17]:
# Logistic regression with an L1 penalty, using the 'liblinear' solver
log_reg_l1 = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1.0,
    max_iter=1000,
)

# fit() returns the fitted estimator, so train and predict in one chained call
y_pred_l1 = log_reg_l1.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report accuracy and per-class metrics for the L1 model
print("Accuracy with L1 Regularization:", accuracy_score(y_test, y_pred_l1))
print("Classification Report:\n", classification_report(y_test, y_pred_l1))

Accuracy with L1 Regularization: 0.475
Classification Report:
               precision    recall  f1-score   support

          In       0.49      0.44      0.46       103
         Out       0.46      0.52      0.49        97

    accuracy                           0.48       200
   macro avg       0.48      0.48      0.47       200
weighted avg       0.48      0.47      0.47       200



In [18]:
# Logistic regression with an explicit L2 penalty.
# The estimator must be bound to `log_reg`, because the fit/predict calls below use
# that name; binding it to any other name would silently re-train the earlier model.
# NOTE: penalty='l2' and C=1.0 are scikit-learn's defaults, so this configuration is
# the same as the baseline LogisticRegression(max_iter=1000) created earlier.
log_reg = LogisticRegression(penalty='l2', C=1.0, max_iter=1000)

# Train the model
log_reg.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.495
Classification Report:
               precision    recall  f1-score   support

          In       0.51      0.48      0.49       103
         Out       0.48      0.52      0.50        97

    accuracy                           0.49       200
   macro avg       0.50      0.50      0.49       200
weighted avg       0.50      0.49      0.49       200

