In [1]:
import pandas as pd

df = pd.read_csv('merged_adf_df_with_lda_topic1.csv', index_col=0)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize a TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer and transform 'LemmatizedQuestionBody' into a feature matrix X
X = vectorizer.fit_transform(df['LemmatizedQuestionBody'])

# The target remains the same
y = df['LDATopic']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a logistic regression model
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
logreg.fit(X_train, y_train)

# Make predictions and print a classification report
y_pred = logreg.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.46      0.57       412
          11       0.85      0.95      0.90      1298

    accuracy                           0.83      1710
   macro avg       0.80      0.71      0.73      1710
weighted avg       0.83      0.83      0.82      1710



In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Same setup as before...

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[ 189  223]
 [  61 1237]]


In [4]:
# Feature importance
feature_names = vectorizer.get_feature_names_out()
coefs_with_fns = sorted(zip(logreg.coef_[0], feature_names))
top = zip(coefs_with_fns[:10], coefs_with_fns[:-(10 + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))



	-5.8374	dataflow       		3.6739	azure          
	-5.4433	flow           		3.0381	factory        
	-4.3436	following      		2.2479	storage        
	-3.4606	mapping        		2.1371	service        
	-3.0339	pipeline       		2.0850	datafactory    
	-2.9945	column         		2.0760	blob           
	-2.6066	sink           		1.8017	file           
	-2.5790	transformation 		1.6949	adf            
	-2.5068	value          		1.6781	activity       
	-2.0134	array          		1.6402	server         


In [5]:
# Cross-validation
scores = cross_val_score(logreg, X, y, cv=5)
print("Cross-validation scores: ", scores)
print("Average cross-validation score: ", scores.mean())

Cross-validation scores:  [0.82631579 0.81637427 0.82913985 0.78057343 0.82036279]
Average cross-validation score:  0.8145532252710966
