In [1]:
# ! pip install -U spacy

In [2]:
# ! spacy download en_core_web_sm

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import spacy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
# Let's covvert words to numbers using TF-IDF 
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Dense, Flatten, Input
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [4]:
def remove_num_punc(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower() 

In [5]:
def lemmitization(text):
    words = ''
    for word in text:
        words += ' ' + word.lemma_
    return words

In [6]:
nlp = spacy.load('en_core_web_sm')

In [7]:
data = pd.read_csv('/kaggle/input/predict-closed-questions-on-stack-overflow/train-sample.csv')

In [8]:
data.head()

Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus
0,6046168,05/18/2011 14:14:05,543315,09/17/2010 10:15:06,1,2,For Mongodb is it better to reference an objec...,I am building a corpus of indexed sentences in...,mongodb,,,,,,open
1,4873911,02/02/2011 11:30:10,465076,10/03/2010 09:30:58,192,24,How to insert schemalocation in a xml document...,i create a xml document with JAXP and search a...,dom,xsd,jaxp,,,,open
2,3311559,07/22/2010 17:21:54,406143,07/22/2010 16:58:20,1,0,Too many lookup tables,What are the adverse effects of having too man...,sql-server,database-design,enums,,,,open
3,9990413,04/03/2012 09:18:39,851755,07/19/2011 10:22:40,4,1,What is this PHP code in VB.net,I am looking for the vb.net equivalent of this...,php,vb.net,,,,04/15/2012 21:12:48,too localized
4,10421966,05/02/2012 21:25:01,603588,02/04/2011 18:05:34,334,14,Spring-Data mongodb querying multiple classes ...,"With Spring-Data, you can use the @Document an...",mongodb,spring-data,,,,,open


In [9]:
data.shape

(140272, 15)

In [10]:
data = data.sample(80000, random_state = 234)
data.head()

Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus
122715,1296097,08/18/2009 19:51:02,135646,07/09/2009 13:13:22,146,13,How to serialize Color property as ARGB values?,I'm working with Windows Forms designer. It se...,winforms,windows-form-designer,colors,rgb,serialization,,open
29350,9349765,02/19/2012 13:52:14,1216512,02/17/2012 14:57:14,1,0,Sending large files in java,How to send large files (2-3 GB) over the netw...,networking,,,,,02/20/2012 14:17:34,not a real question
6938,10974978,06/11/2012 05:47:32,748164,05/11/2011 07:12:53,44,1,How to Create personal online radio station?,"I want to create my own radio station, that is...",asp.net,,,,,06/11/2012 07:35:28,off topic
131897,11364721,07/06/2012 15:00:12,623694,02/18/2011 19:35:31,677,32,how to name my android software legally?,Im new to android programming and i found a pr...,android,android-market,google-play,,,07/06/2012 15:54:05,off topic
10420,11076945,06/18/2012 04:31:24,1314162,04/05/2012 01:24:25,1,0,Broke Emacs 24 on Lion 10.7.4,I just managed to break my beloved Emacs on Li...,emacs,osx-lion,macports,ncurses,,06/25/2012 15:57:29,off topic


In [11]:
data.isnull().sum()

PostId                                     0
PostCreationDate                           0
OwnerUserId                                0
OwnerCreationDate                          0
ReputationAtPostCreation                   0
OwnerUndeletedAnswerCountAtPostTime        0
Title                                      0
BodyMarkdown                               0
Tag1                                       6
Tag2                                   15642
Tag3                                   36753
Tag4                                   57231
Tag5                                   71055
PostClosedDate                         40024
OpenStatus                                 0
dtype: int64

In [12]:
data = data[['Title', 'BodyMarkdown', 'OpenStatus']]

In [13]:
data.head()

Unnamed: 0,Title,BodyMarkdown,OpenStatus
122715,How to serialize Color property as ARGB values?,I'm working with Windows Forms designer. It se...,open
29350,Sending large files in java,How to send large files (2-3 GB) over the netw...,not a real question
6938,How to Create personal online radio station?,"I want to create my own radio station, that is...",off topic
131897,how to name my android software legally?,Im new to android programming and i found a pr...,off topic
10420,Broke Emacs 24 on Lion 10.7.4,I just managed to break my beloved Emacs on Li...,off topic


In [14]:
data = data.dropna()

In [15]:
data.isnull().sum()

Title           0
BodyMarkdown    0
OpenStatus      0
dtype: int64

In [16]:
# data = data[:10000]

In [17]:
# data['Title'] = data['Title'].apply(lambda x: nlp(x))
# data['BodyMarkdown'] = data['BodyMarkdown'].apply(lambda x: nlp(x))

In [18]:
data['Title'] = data['Title'].apply(lambda x: remove_num_punc(x))
data['BodyMarkdown'] = data['BodyMarkdown'].apply(lambda x: remove_num_punc(x))

In [19]:
# data['Title'] = data['Title'].apply(lambda x: lemmitization(x))
# data['BodyMarkdown'] = data['BodyMarkdown'].apply(lambda x: lemmitization(x))

In [20]:
data.head()

Unnamed: 0,Title,BodyMarkdown,OpenStatus
122715,how to serialize color property as argb values,i m working with windows forms designer it se...,open
29350,sending large files in java,how to send large files gb over the netw...,not a real question
6938,how to create personal online radio station,i want to create my own radio station that is...,off topic
131897,how to name my android software legally,im new to android programming and i found a pr...,off topic
10420,broke emacs on lion,i just managed to break my beloved emacs on li...,off topic


In [21]:
encoder = LabelEncoder()
# data['Tag1'] = encoder.fit_transform(data['Tag1'])
data['OpenStatus'] = encoder.fit_transform(data['OpenStatus'])

In [22]:
data.head()

Unnamed: 0,Title,BodyMarkdown,OpenStatus
122715,how to serialize color property as argb values,i m working with windows forms designer it se...,3
29350,sending large files in java,how to send large files gb over the netw...,0
6938,how to create personal online radio station,i want to create my own radio station that is...,2
131897,how to name my android software legally,im new to android programming and i found a pr...,2
10420,broke emacs on lion,i just managed to break my beloved emacs on li...,2


In [23]:
target = data['OpenStatus']
data = data.drop(['OpenStatus'], axis=1)

In [24]:
data.head()

Unnamed: 0,Title,BodyMarkdown
122715,how to serialize color property as argb values,i m working with windows forms designer it se...
29350,sending large files in java,how to send large files gb over the netw...
6938,how to create personal online radio station,i want to create my own radio station that is...
131897,how to name my android software legally,im new to android programming and i found a pr...
10420,broke emacs on lion,i just managed to break my beloved emacs on li...


In [25]:
data = data['Title'] + ' '+ data['BodyMarkdown']

In [26]:
xtrain, xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3, random_state = 42)
data = []

In [27]:
xtrain.shape

(56000,)

In [28]:
vectorizer = TfidfVectorizer(max_features = 1000)  # it contains only 10k features (fixed!)

xtrain = vectorizer.fit_transform(xtrain).toarray()  # converting words to numbers for train data 
xtest = vectorizer.transform(xtest).toarray()        # converting words to numbers for test data 

In [29]:
xtrain.shape

(56000, 1000)

In [30]:
set(ytrain)

{0, 1, 2, 3, 4}

In [31]:
from sklearn.neural_network import MLPClassifier

mlp_cv=MLPClassifier(early_stopping=True, verbose=2)
mlp_cv.fit(xtrain, ytrain)

Iteration 1, loss = 1.14862808
Validation score: 0.620893
Iteration 2, loss = 0.96125810
Validation score: 0.619643
Iteration 3, loss = 0.93586758
Validation score: 0.617143
Iteration 4, loss = 0.92322745
Validation score: 0.622143
Iteration 5, loss = 0.91252599
Validation score: 0.620536
Iteration 6, loss = 0.90255585
Validation score: 0.622500
Iteration 7, loss = 0.89152095
Validation score: 0.621071
Iteration 8, loss = 0.87954500
Validation score: 0.622321
Iteration 9, loss = 0.86727904
Validation score: 0.628036
Iteration 10, loss = 0.85501732
Validation score: 0.625357
Iteration 11, loss = 0.84215464
Validation score: 0.621071
Iteration 12, loss = 0.82986132
Validation score: 0.623036
Iteration 13, loss = 0.81722133
Validation score: 0.622321
Iteration 14, loss = 0.80393377
Validation score: 0.622500
Iteration 15, loss = 0.79095773
Validation score: 0.621429
Iteration 16, loss = 0.77854277
Validation score: 0.619821
Iteration 17, loss = 0.76517680
Validation score: 0.615714
Iterat

MLPClassifier(early_stopping=True, verbose=2)

In [32]:
predicted_mlp = mlp_cv.predict(xtest)

In [33]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

def metrics(predicted): 
    predicted_naive = predicted 
    print('Accuracy Score \n',accuracy_score(predicted_naive, ytest))
    print('Confusion Matrix \n', confusion_matrix(predicted_naive, ytest))
    print('Classification Report \n', classification_report(predicted_naive, ytest))

In [34]:
metrics(predicted_mlp)

Accuracy Score 
 0.6262083333333334
Confusion Matrix 
 [[2210  303  354 1123  217]
 [ 406 1582  448  438   29]
 [ 298  270 1274  478   61]
 [2295  521  960 9946  732]
 [  19    1    2   16   17]]
Classification Report 
               precision    recall  f1-score   support

           0       0.42      0.53      0.47      4207
           1       0.59      0.54      0.57      2903
           2       0.42      0.54      0.47      2381
           3       0.83      0.69      0.75     14454
           4       0.02      0.31      0.03        55

    accuracy                           0.63     24000
   macro avg       0.46      0.52      0.46     24000
weighted avg       0.69      0.63      0.65     24000



In [35]:
from sklearn.svm import SVC
model = SVC()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.6317916666666666

In [36]:
ann = Sequential()
ann.add(Input(xtrain.shape))
# Add an hidden layer
ann.add(Dense(1024, activation='relu'))
ann.add(Dense(1024, activation='relu'))
ann.add(Dense(512, activation='relu'))
ann.add(Dense(512, activation='relu'))
ann.add(Dense(256, activation='relu'))
ann.add(Dense(128, activation='relu'))
ann.add(Dense(64, activation='relu'))
ann.add(Dense(32, activation='relu'))
ann.add(Dense(16, activation='relu'))
# softmax normalizes it into a probability distribution consisting of
# K probabilities proportional to the exponentials of the input
# Add an output layer
ann.add(Dense(5, activation='softmax'))

2023-02-10 21:29:14.024846: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 21:29:14.025767: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 21:29:14.026900: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 21:29:14.027652: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 21:29:14.028337: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [37]:
ann.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 56000, 1024)       1025024   
_________________________________________________________________
dense_1 (Dense)              (None, 56000, 1024)       1049600   
_________________________________________________________________
dense_2 (Dense)              (None, 56000, 512)        524800    
_________________________________________________________________
dense_3 (Dense)              (None, 56000, 512)        262656    
_________________________________________________________________
dense_4 (Dense)              (None, 56000, 256)        131328    
_________________________________________________________________
dense_5 (Dense)              (None, 56000, 128)        32896     
_________________________________________________________________
dense_6 (Dense)              (None, 56000, 64)         8

In [38]:
ann.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [39]:
early_stopping = EarlyStopping(patience=10, verbose=1)
model_checkpoint = ModelCheckpoint("best_model.h5", save_best_only=True, verbose=1)

In [40]:
ann.fit(xtrain, ytrain, epochs=50, batch_size=32, validation_data=(xtest, ytest), callbacks=[model_checkpoint])

2023-02-10 21:29:26.163836: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50

Epoch 00001: val_loss improved from inf to 1.04618, saving model to best_model.h5
Epoch 2/50

Epoch 00002: val_loss improved from 1.04618 to 1.00771, saving model to best_model.h5
Epoch 3/50

Epoch 00003: val_loss did not improve from 1.00771
Epoch 4/50

Epoch 00004: val_loss did not improve from 1.00771
Epoch 5/50

Epoch 00005: val_loss did not improve from 1.00771
Epoch 6/50

Epoch 00006: val_loss did not improve from 1.00771
Epoch 7/50

Epoch 00007: val_loss did not improve from 1.00771
Epoch 8/50

Epoch 00008: val_loss did not improve from 1.00771
Epoch 9/50

Epoch 00009: val_loss did not improve from 1.00771
Epoch 10/50

Epoch 00010: val_loss did not improve from 1.00771
Epoch 11/50

Epoch 00011: val_loss did not improve from 1.00771
Epoch 12/50

Epoch 00012: val_loss did not improve from 1.00771
Epoch 13/50

Epoch 00013: val_loss did not improve from 1.00771
Epoch 14/50

Epoch 00014: val_loss did not improve from 1.00771
Epoch 15/50

Epoch 00015: val_loss did not impr

<keras.callbacks.History at 0x7f37273b6290>

In [41]:
ann.save_weights('stackoverflow_weights.h5')

In [42]:
# prediction = ann.predict(x_test)
score = ann.evaluate(xtest, ytest, verbose=0)
print("accuracy ANN", score[1] * 100)
# prediction = np.argmax(prediction, axis=1)
# y_test = np.argmax(y_test, axis=1)

accuracy ANN 58.45833420753479
