In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
import tensorflow as tf

In [None]:
def no_punctuation(text):
    """Return `text` with every non-word, non-whitespace character removed.

    Word characters (letters, digits, underscore) and whitespace are kept;
    everything else is deleted without inserting a replacement.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text)

def lowercase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered


def stem_words(text):
    """Stem every word of `text` with the Porter stemmer.

    Parameters
    ----------
    text : str or iterable of str
        Either a whitespace-separated string or an already tokenized
        sequence of words.

    Returns
    -------
    list of str
        One stemmed token per input word, in the original order.
    """
    # Iterating a bare string yields single characters, which would stem
    # letters instead of words, so strings are split into tokens first.
    words = text.split() if isinstance(text, str) else text
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_words(text):
    """Lemmatize every word of `text` with the WordNet lemmatizer.

    Parameters
    ----------
    text : str or iterable of str
        Either a whitespace-separated string or an already tokenized
        sequence of words.

    Returns
    -------
    list of str
        One lemmatized token per input word, in the original order.
    """
    # Iterating a bare string yields single characters, so strings are
    # split into word tokens before lemmatizing.
    words = text.split() if isinstance(text, str) else text
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]


In [None]:
def text_preprocessing(text):
    """Normalize a title: strip punctuation, lower-case, tokenize and stem.

    Parameters
    ----------
    text : str
        Raw title text.

    Returns
    -------
    list of str
        The stemmed word tokens of `text`, in order.
    """
    text = no_punctuation(text)
    text = lowercase(text)
    # Split into word tokens before stemming; handing the raw string to
    # stem_words would make it iterate over individual characters.
    tokens = text.split()
    return stem_words(tokens)

# Load the spreadsheet of project titles and their themes into a DataFrame.
df = pd.read_excel(io='/content/ThemeData.xlsx')

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Project Name,Theme
0,10 kWp Solar PV Power Plant at Admini Block,Environment and Sustainable Development Studies
1,1000 LPD Solar Water Heater at student home,Environment and Sustainable Development Studies
2,10m3 night soil based Bio gas plant at student...,Environment and Sustainable Development Studies
3,1KW DFIG SETUP WITH IGBT BASED CONVERTER - INV...,Engineering and Technology
4,2009 Ethno Veterinary Use of Medicinal plants ...,Health and Medicine


In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Project Name'],
    df['Theme'],
    test_size=0.10,
    random_state=42,
)

In [None]:
# TF-IDF vectorizer created with its default settings; it is fitted on the
# training titles in the next cell.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the training titles only, then reuse the fitted
# vectorizer on the test titles so both matrices share one feature space.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Regularisation strengths tried for every solver/penalty combination.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

# A single cartesian grid pairs 'l1' with newton-cg, lbfgs and sag, which
# LogisticRegression rejects (those solvers support L2 only), so those fits
# fail and are scored as NaN. A list of sub-grids restricts the search to the
# valid combinations; best_params_ still exposes 'C', 'solver' and 'penalty'.
param_grid = [
    {'C': c_values, 'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2']},
    {'C': c_values, 'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2']},
]

# The default max_iter of 100 triggers "iterations reached limit" warnings on
# this data, so the solvers are given a larger iteration budget.
clf = LogisticRegression(max_iter=1000)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

print('Best hyperparameters:', grid_search.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}


90 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 67, in _check_solver
   

In [None]:
# grid_search was built with the default refit=True, so it has already
# retrained the best configuration on the full training set. Reusing that
# estimator avoids a duplicate fit and keeps every setting of the searched
# estimator, not just C, solver and penalty.
clf = grid_search.best_estimator_

clf

In [None]:
y_pred = clf.predict(X_test_tfidf)

# Share of held-out titles whose predicted theme equals the true theme.
is_correct = y_pred == y_test
accuracy = np.mean(is_correct)

print(f'Accuracy on test set: {accuracy:.2f}')

Accuracy on test set: 0.62


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Micro-averaged scores. For single-label multiclass predictions these three
# values are all equal to plain accuracy, so they are kept only for continuity.
precision = precision_score(y_test, y_pred, average='micro')
print('Precision:', precision)

recall = recall_score(y_test, y_pred, average='micro')
print('Recall:', recall)

f1 = f1_score(y_test, y_pred, average='micro')
print('F1:', f1)

# Macro-averaged scores give every theme equal weight, so weak performance on
# rare themes is visible. zero_division=0 scores a class with no predicted or
# no true samples as 0 instead of emitting a warning.
macro_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
print('Macro precision:', macro_precision)

macro_recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
print('Macro recall:', macro_recall)

macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
print('Macro F1:', macro_f1)

Precision: 0.6177606177606177
Recall: 0.6177606177606177
F1: 0.6177606177606177


In [None]:
# Title to classify
new_title = "Corona Virus and its "

# The vectorizer was fitted on the raw 'Project Name' strings, so the new
# title is vectorized raw as well. The earlier text_preprocessing call
# produced a value that was never used, and feeding preprocessed text to this
# vectorizer would not match the features the model was trained on.
new_title_tfidf = vectorizer.transform([new_title])

# Predict the theme for the single title
prediction = clf.predict(new_title_tfidf)

print(prediction)

['Engineering and Technology']
