# Setup

In [2]:
import sys
import pandas as pd
from google.colab import drive
import numpy as np
import seaborn as sns
import re
import copy
import joblib
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report, f1_score, \
  precision_score, recall_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# All inputs and outputs of this notebook live in one Google Drive folder.
# Shared copy: https://drive.google.com/drive/folders/1JwGMvRrlptyIfmMSezMPYXGPubGuwG-4?usp=sharing
data_path = "/content/drive/MyDrive/data/final_project/"
drive.mount("/content/drive")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def disp(df, n=5):
  """Print the shape of ``df``, then render its first ``n`` rows.

  Args:
    df: object exposing ``.shape`` and ``.head(n)`` (a DataFrame in this notebook).
    n: number of leading rows to show (default 5).
  """
  preview = df.head(n)
  print(df.shape)
  display(preview)

# Data Prep

In [28]:
# Load the questions table from the project folder.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which pandas deprecated
# in 1.3 and removed in 2.0: malformed rows are still skipped, with a warning
# emitted for each one.
questions = pd.read_csv(f"{data_path}Questions.csv", header=0,
                        encoding="ISO-8859-1", engine="python",
                        on_bad_lines="warn")

# Keep only the columns used downstream and rename them to lower case.
# `list(cols)` passes an explicit list of labels rather than a dict_keys view.
cols = {"Id": "id", "Title": "title", "Body": "body"}
questions = questions[list(cols)].rename(columns=cols)

disp(questions)



  exec(code_obj, self.user_global_ns, self.user_ns)


(1264216, 3)


Unnamed: 0,id,title,body
0,80,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [29]:
# Load the long-format (question id, tag) table.
# `on_bad_lines="warn"` replaces `error_bad_lines=False`, which pandas deprecated
# in 1.3 and removed in 2.0: malformed rows are still skipped, with a warning.
tags_long = pd.read_csv(f"{data_path}Tags.csv", header=0, encoding="ISO-8859-1",
                        engine="python", on_bad_lines="warn")

# Restrict to the ten target language tags.
valid_tags = ["javascript", "java", "c#", "php", "python", "html", "c++", "sql", "c", "r"]
tags_long = tags_long[tags_long["Tag"].isin(valid_tags)]

# Collapse to one row per question, gathering that question's target tags into
# a tuple. Tuples already support `.str.len()` and `.str[0]`, so no element-wise
# conversion to lists is needed.
tags = tags_long.groupby("Id").agg(tuple).reset_index()
tags = tags.rename(columns={"Id": "id", "Tag": "tag"})

# Keep only questions carrying exactly one target tag, then unwrap that tag.
# `.copy()` makes the filtered frame independent so the assignment below does
# not write into a slice of the grouped frame.
tags = tags[tags["tag"].str.len() == 1].copy()
tags["tag"] = tags["tag"].str[0]

disp(tags)



  exec(code_obj, self.user_global_ns, self.user_ns)


(569098, 2)


Unnamed: 0,id,tag
0,120,sql
1,260,c#
2,330,c++
3,650,c#
4,930,c#


In [30]:
# Number of single-tag questions available for each language.
tags.loc[:, "tag"].value_counts()

java          109513
c#             94389
javascript     90801
php            80243
python         61210
c++            42357
sql            28133
html           27935
c              19337
r              15180
Name: tag, dtype: int64

In [31]:
# Attach each question's single tag, then draw a balanced, seeded sample of
# 15000 questions per tag.
df = (
    questions.merge(tags, how="inner", on="id")
    .groupby("tag")
    .sample(n=15000, random_state=314)
    .reset_index(drop=True)
)

# The source frames are no longer needed; release them.
del questions, tags_long, tags

disp(df)

(150000, 4)


Unnamed: 0,id,title,body,tag
0,25600370,Data Validation in C when the user inputs an i...,<p>How do I ensure that the user inputs only a...,c
1,35082100,c malloc functionality for custom memory region,<p>Is there any malloc/realloc/free like imple...,c
2,18968070,Error when defining a stringising macro with _...,<p>I have been trying to implement a function ...,c
3,24130610,"Valgrind ""Conditional jump or move depends on ...",<p>I'm very new to C programming (my first que...,c
4,26009570,What improvements does GCC's `__builtin_malloc...,<p>I have recently been made aware of GCC's bu...,c


# Preprocessing

In [10]:
# Matches any single ASCII punctuation character. Built from string.punctuation
# via re.escape so that every character, including the backslash, is covered
# and no hand-written escape sequences are needed.
_PUNCT_RE = re.compile("[" + re.escape(string.punctuation) + "]")

# Lazily filled cache of stop-word sets, keyed by language, so the NLTK corpus
# is read once instead of on every call.
_STOPWORD_CACHE = {}


def process_text(text, stop=None):
    """Normalise raw text for bag-of-words vectorisation.

    Every ASCII punctuation character is replaced by a space, the text is
    lower-cased, split on whitespace, and stop words are dropped. Because "#"
    and "+" are punctuation, tokens such as "c#" and "c++" both reduce to "c".
    Markup is not parsed: the names inside HTML tags survive as ordinary tokens.

    Args:
        text: the string to clean.
        stop: optional container of lower-case words to remove (anything
            supporting ``in``). When omitted, NLTK's English stop-word list is
            used; it is loaded on first use and cached as a frozenset.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if stop is None:
        if "english" not in _STOPWORD_CACHE:
            _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
        stop = _STOPWORD_CACHE["english"]

    tokens = _PUNCT_RE.sub(" ", text).lower().split()
    return " ".join(token for token in tokens if token not in stop)

In [35]:
# Add a cleaned companion column ("<field>_proc") for each free-text field.
for text_col in ("title", "body"):
    df[f"{text_col}_proc"] = df[text_col].map(process_text)

disp(df)

(150000, 6)


Unnamed: 0,id,title,body,tag,title_proc,body_proc
0,25600370,Data Validation in C when the user inputs an i...,<p>How do I ensure that the user inputs only a...,c,data validation c user inputs incorrect value ...,p ensure user inputs integer value program won...
1,35082100,c malloc functionality for custom memory region,<p>Is there any malloc/realloc/free like imple...,c,c malloc functionality custom memory region,p malloc realloc free like implementation spec...
2,18968070,Error when defining a stringising macro with _...,<p>I have been trying to implement a function ...,c,error defining stringising macro va args,p trying implement function macro c prepends d...
3,24130610,"Valgrind ""Conditional jump or move depends on ...",<p>I'm very new to C programming (my first que...,c,valgrind conditional jump move depends uniniti...,p new c programming first question stackoverfl...
4,26009570,What improvements does GCC's `__builtin_malloc...,<p>I have recently been made aware of GCC's bu...,c,improvements gcc builtin malloc provide plain ...,p recently made aware gcc built functions c li...


In [75]:
# Spot-check one question: print its raw and cleaned text and its tag.
i = 2

cond = (df["tag"] == "c#")
matching = df.loc[cond]

# (heading, column) pairs in print order; every heading after the first
# starts with a blank line.
sections = [
    ("----RAW TITLE:", "title"),
    ("\n----RAW BODY:", "body"),
    ("\n----PROCESSED TITLE:", "title_proc"),
    ("\n----PROCESSED BODY:", "body_proc"),
    ("\n----TAG:", "tag"),
]
for heading, column in sections:
    print(heading)
    print(matching[column].iloc[i])

----RAW TITLE:
C# skipping first line of a text file

----RAW BODY:
<p>here is the code i'm using</p>

<pre><code>        using (FileStream fs = new FileStream(filename, FileMode.Open))
        using (StreamReader rdr = new StreamReader(fs))
        {
            while (!rdr.EndOfStream)
            {
                for (int z = 0; z &lt; 2; z++)
                {

                    string[] lines = rdr.ReadLine().Split('|');
                    {
                        sb.AppendLine(";Re");
                        sb.AppendLine("@C PAMT " + lines[3]);
                        sb.AppendLine("@T " + lines[0]);
                        sb.AppendLine("@D @I\\" + lines[1]).Replace("I:\\", "");
                        sb.AppendLine(lines[2].Replace(";", "\r\n");
                    }
                }
            }
        }

        using (FileStream fs = new FileStream(outputfilename, FileMode.Create))
        using (StreamWriter writer = new StreamWriter(fs))
        {
            writ

In [37]:
# Keep only rows where both cleaned text fields are non-null.
has_text = df["title_proc"].notnull() & df["body_proc"].notnull()
df = df[has_text]

# Independent snapshot of the filtered frame.
df_copy = copy.copy(df)

In [38]:
# Upper bound on the vocabulary size of each vectorizer.
n_features = 2000

# Term counts for the cleaned bodies and titles, one vectorizer per field.
# Each count frame is built on df's own index: the column-wise concat below
# aligns on index labels, so a fresh 0..n-1 index would pair counts with the
# wrong rows (and introduce NaN rows) whenever df's index has gaps, e.g. after
# rows have been filtered out.
matrix_body = CountVectorizer(max_features=n_features)
X_body = pd.DataFrame(
    matrix_body.fit_transform(df["body_proc"]).toarray(), index=df.index
).add_prefix("X_body_")

matrix_title = CountVectorizer(max_features=n_features)
X_title = pd.DataFrame(
    matrix_title.fit_transform(df["title_proc"]).toarray(), index=df.index
).add_prefix("X_title_")

# Append the feature columns to the main frame, then free the intermediates.
df = pd.concat([df, X_body, X_title], axis=1)
del X_body, X_title

disp(df, 1)

(150000, 4006)


Unnamed: 0,id,title,body,tag,title_proc,body_proc,X_body_0,X_body_1,X_body_2,X_body_3,...,X_title_1990,X_title_1991,X_title_1992,X_title_1993,X_title_1994,X_title_1995,X_title_1996,X_title_1997,X_title_1998,X_title_1999
0,25600370,Data Validation in C when the user inputs an i...,<p>How do I ensure that the user inputs only a...,c,data validation c user inputs incorrect value ...,p ensure user inputs integer value program won...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Append one indicator column per tag value; these are the model targets.
df = pd.concat([df, pd.get_dummies(df["tag"])], axis=1)

disp(df, 1)

(150000, 4016)


Unnamed: 0,id,title,body,tag,title_proc,body_proc,X_body_0,X_body_1,X_body_2,X_body_3,...,c,c#,c++,html,java,javascript,php,python,r,sql
0,25600370,Data Validation in C when the user inputs an i...,<p>How do I ensure that the user inputs only a...,c,data validation c user inputs incorrect value ...,p ensure user inputs integer value program won...,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [40]:
# Persist the full feature table (with a header row, without the index) for the modelling stage.
df.to_csv(f"{data_path}data.csv", index=False)

# Model Build

In [41]:
# Reload the saved feature table; the first line of the file is the header.
df = pd.read_csv(f"{data_path}data.csv")

disp(df)

(150000, 4016)


Unnamed: 0,id,title,body,tag,title_proc,body_proc,X_body_0,X_body_1,X_body_2,X_body_3,...,c,c#,c++,html,java,javascript,php,python,r,sql
0,25600370,Data Validation in C when the user inputs an i...,<p>How do I ensure that the user inputs only a...,c,data validation c user inputs incorrect value ...,p ensure user inputs integer value program won...,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,35082100,c malloc functionality for custom memory region,<p>Is there any malloc/realloc/free like imple...,c,c malloc functionality custom memory region,p malloc realloc free like implementation spec...,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,18968070,Error when defining a stringising macro with _...,<p>I have been trying to implement a function ...,c,error defining stringising macro va args,p trying implement function macro c prepends d...,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,24130610,"Valgrind ""Conditional jump or move depends on ...",<p>I'm very new to C programming (my first que...,c,valgrind conditional jump move depends uniniti...,p new c programming first question stackoverfl...,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,26009570,What improvements does GCC's `__builtin_malloc...,<p>I have recently been made aware of GCC's bu...,c,improvements gcc builtin malloc provide plain ...,p recently made aware gcc built functions c li...,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [42]:
# Confirm the classes are balanced after sampling.
df.loc[:, "tag"].value_counts()

c             15000
c#            15000
c++           15000
html          15000
java          15000
javascript    15000
php           15000
python        15000
r             15000
sql           15000
Name: tag, dtype: int64

In [44]:
# Feature columns are those whose name contains "X_"; target columns are the
# one-hot indicators named after each tag value.
x_cols = [name for name in df.columns if "X_" in name]
y_cols = df["tag"].unique().tolist()

# Seeded 80/20 train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[x_cols],
    df[y_cols],
    test_size=0.20,
    random_state=314,
)
del df

# Fit the TF-IDF weighting on the training counts only, then apply the same
# fitted weighting to both splits.
tfidf = TfidfTransformer()
tfidf.fit(X_train)
X_train = tfidf.transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()

In [45]:
# Persist the fitted transformers next to the data, one "<name>.pkl" each.
artifacts = {
    "tfidf": tfidf,
    "matrix_title": matrix_title,
    "matrix_body": matrix_body,
}
saved_paths = [joblib.dump(obj, f"{data_path}{stem}.pkl") for stem, obj in artifacts.items()]

# Show the result of the last dump, as the cell did before.
saved_paths[-1]

['/content/drive/MyDrive/data/final_project/matrix_body.pkl']

In [46]:
# Feed-forward classifier: three ReLU hidden layers (4000 -> 1000 -> 100 units),
# each followed by 25% dropout, ending in a 10-unit softmax output.
model = Sequential([
    Dense(units=4000, activation='relu'),
    Dropout(0.25),
    Dense(units=1000, activation='relu'),
    Dropout(0.25),
    Dense(units=100, activation='relu'),
    Dropout(0.25),
    Dense(units=10, activation='softmax'),
])

opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Stop training once validation loss has gone 2 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', verbose=1)

In [47]:
# Train for up to 10 epochs with early stopping. `callbacks` is documented as a
# list of callback instances, so the single callback is wrapped in one.
# NOTE(review): the test split is also passed as validation data, so it drives
# early stopping; metrics computed on X_test afterwards are therefore optimistic.
# Consider carving a separate validation split out of the training data.
model.fit(X_train, Y_train, epochs=10, batch_size=256,
          validation_data=(X_test, Y_test), callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


<keras.callbacks.History at 0x7fe2cdc7cfd0>

In [48]:
# Predicted class scores for the test split, one column per tag.
preds_df = pd.DataFrame(model.predict(X_test), columns=Y_test.columns)
# Predicted tag = label of the highest-scoring column in each row. idxmax is
# the vectorised form of the previous row-wise apply/argmax lookup.
preds_df["tag"] = preds_df.idxmax(axis=1)

# Recover the true tag from the one-hot targets the same way, working on a copy
# so Y_test itself stays purely numeric.
Y_test_df = Y_test.copy()
Y_test_df["tag"] = Y_test_df.idxmax(axis=1)

# Per-class precision / recall / F1 on the test split.
cr = classification_report(Y_test_df["tag"], preds_df["tag"])

print(cr)

              precision    recall  f1-score   support

           c       0.79      0.83      0.81      3002
          c#       0.83      0.74      0.78      2958
         c++       0.78      0.77      0.78      3003
        html       0.86      0.80      0.83      2959
        java       0.81      0.88      0.85      3022
  javascript       0.81      0.82      0.81      2994
         php       0.90      0.87      0.88      3073
      python       0.91      0.90      0.91      2949
           r       0.88      0.91      0.90      2996
         sql       0.87      0.92      0.90      3044

    accuracy                           0.84     30000
   macro avg       0.84      0.84      0.84     30000
weighted avg       0.84      0.84      0.84     30000



In [49]:
# Persist the trained network under the project data folder.
export_dir = f"{data_path}model"
model.save(export_dir)

INFO:tensorflow:Assets written to: /content/drive/MyDrive/data/final_project/model/assets
