In [16]:
# Importing Sqlite3 Module
import sqlite3
import pandas as pd

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn import metrics

## Task 1

In [26]:
try:

    # Making a connection between sqlite3
    # database and Python Program
    sqliteConnection = sqlite3.connect('database.db')

    # If sqlite3 makes a connection with python
    # program then it will print "Connected to SQLite"
    # Otherwise it will show errors
    print("Connected to SQLite")

    # Getting all tables from sqlite_master
    sql_query = """SELECT post_id,
                text,
                topic,
                forum_id,
                case
                    when (birth_year % 10) in (0, 1) then 'test'
                    else 'train'
                end as partition
            from posts
            left join subforums using (forum_id)
            left join users using (user_id)"""

    # Creating cursor object using connection object
    cursor = sqliteConnection.cursor()

    # executing our sql query
    cursor.execute(sql_query)
    print("List of tables\n")
    
    #Create dataframe from SQL table
    complete_data = pd.DataFrame(cursor.fetchall(), columns = ['id', 'text', 'forum', 'forum_id', 'partition'])
    print (complete_data.head())

except sqlite3.Error as error:
    print("Failed to execute the above query", error)

finally:

    # Inside Finally Block, If connection is
    # open, we need to close it
    if sqliteConnection:

        # using close() method, we will close
        # the connection
        sqliteConnection.close()

        # After closing connection object, we
        # will print "the sqlite connection is
        # closed"
        print("the sqlite connection is closed")


Connected to SQLite
List of tables

   id                                               text            forum  \
0   0  From article <1993Apr22.233001.13436@vax.oxfor...          sci.med   
1   1  \nIf the tire has a leak you should fix it. \n...        rec.autos   
2   2  \n\nOkay Mr. Dyer, we're properly impressed wi...          sci.med   
3   3  I just bought a little gizmo that is supposed ...  sci.electronics   
4   4  molecules has evolved from an esoteric academi...    comp.graphics   

   forum_id partition  
0         3     train  
1         1      test  
2         3     train  
3         2     train  
4         0     train  
the sqlite connection is closed


In [32]:
complete_data.shape

(4924, 5)

In [33]:
complete_data.isna().sum()

id           0
text         0
forum        0
forum_id     0
partition    0
dtype: int64

## Task 2

In [28]:
train_df = complete_data[complete_data['partition']=='train'] 

In [29]:
train_df.head()

Unnamed: 0,id,text,forum,forum_id,partition
0,0,From article <1993Apr22.233001.13436@vax.oxfor...,sci.med,3,train
2,2,"\n\nOkay Mr. Dyer, we're properly impressed wi...",sci.med,3,train
3,3,I just bought a little gizmo that is supposed ...,sci.electronics,2,train
4,4,molecules has evolved from an esoteric academi...,comp.graphics,0,train
5,5,Anybody know where I can get Graphics Work Shop?,comp.graphics,0,train


In [246]:
#Separating text and labels
train_x = train_df['text']
train_y = train_df['forum_id']

In [247]:
#Separating the data into validation dataset. For sake of interpretability we will call this as Validation dataset instead
# of test
val_df = complete_data[complete_data['partition']=='test'] 

In [248]:
val_df.head()

Unnamed: 0,id,text,forum,forum_id,partition
1,1,\nIf the tire has a leak you should fix it. \n...,rec.autos,1,test
7,7,"\n As for advertising -- sure, why not? A N...",sci.space,4,test
20,20,Does anyone have any Russian Contacts (Space o...,sci.space,4,test
22,22,"Sorry, _perijoves_...I'm not used to talking t...",sci.space,4,test
24,24,"\n\tI am puzzled by the term ""concept."" Drag f...",sci.space,4,test


In [249]:
#Separating text and labels
val_x = val_df['text']
val_y = val_df['forum_id']

In [250]:
# Initialize Tf-Idf Vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')

In [251]:
#Vectorize text and extract features
print("Extracting features from the train data using the same vectorizer")
X_train = vectorizer.fit_transform(train_x)
print("Extracting features from the validation data using the same vectorizer")
X_val = vectorizer.transform(val_x)

Extracting features from the train data using the same vectorizer
Extracting features from the validation data using the same vectorizer


In [253]:
#Linear Support Vector for Classification
clf = LinearSVC()

In [254]:
clf.fit(X_train, train_y)

LinearSVC()

## Task 3

In [258]:
pred = clf.predict(X_val)
score = metrics.accuracy_score(val_y, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.887


In [259]:
target_names = ['0','1','2','3','4']

In [260]:
print("classification report:")
print(metrics.classification_report(val_y, pred, target_names=target_names))

classification report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91       180
           1       0.82      0.91      0.86       199
           2       0.87      0.86      0.86       210
           3       0.94      0.90      0.92       213
           4       0.91      0.87      0.89       195

    accuracy                           0.89       997
   macro avg       0.89      0.89      0.89       997
weighted avg       0.89      0.89      0.89       997



## Task 4

I noticed that the json data has some unexpected data which was causing error while loading the data directly in the pandas dataframe. Hence I had to clean the data before using it further.

In [237]:
import json
import ast
ast.literal_eval
with open('new_posts.json') as datafile:
    data = datafile.read()

In [238]:
row_list = [row for row in data.split('\n') if row != '']

In [261]:
#Unexpected text between the json data on row 2001
row_list[2001]

'exit(1)'

In [241]:
row_dict_list = []
for index,row in enumerate(row_list):
    try:
        row_dict_list.append(json.loads(row))
    except:
        pass

In [242]:
len(row_dict_list)

4924

In [243]:
#Converting the json data to pandas dataframe
test_df = pd.DataFrame.from_dict(row_dict_list)

In [244]:
test_df

Unnamed: 0,post_id,topic,text,forum_id
0,4923,sci.med,From article <1993Apr22.233001.13436@vax.oxfor...,3
1,4924,sci.electronics,\nIf the tire has a leak you should fix it. \n...,2
2,4925,sci.med,"\n\nOkay Mr. Dyer, we're properly impressed wi...",3
3,4926,sci.electronics,I just bought a little gizmo that is supposed ...,2
4,4927,comp.graphics,molecules has evolved from an esoteric academi...,0
...,...,...,...,...
4919,9842,sci.space,\n\nI was suggesting that the minority of prof...,4
4920,9843,sci.med,"% mail newsserv@kiae.su\nSubject: PLEASE, HEL...",3
4921,9844,comp.graphics,\nMine was beautiful for a year and a half. T...,0
4922,9845,sci.med,\n\nAs a child i can remember picking up a cen...,3


In [263]:
#Vectorize text and extract features
test_x = test_df['text']
print("Extracting features from the test data using the same vectorizer")
X_test = vectorizer.transform(test_x)


Extracting features from the test data using the same vectorizer


In [264]:
#Predict on test data
final_pred = clf.predict(X_test)

In [265]:
final_pred

array([3, 1, 3, ..., 0, 3, 4], dtype=int64)

In [267]:
test_df['predicted_forum_id'] = final_pred

In [268]:
def check_forum(x):
    if x['forum_id']!= x['predicted_forum_id']:
        return x['predicted_forum_id']
    else:
        return None
    

In [269]:
test_df['reassign'] = test_df.apply(check_forum, axis=1)

In [270]:
test_df

Unnamed: 0,post_id,topic,text,forum_id,predicted_forum_id,reassign
0,4923,sci.med,From article <1993Apr22.233001.13436@vax.oxfor...,3,3,
1,4924,sci.electronics,\nIf the tire has a leak you should fix it. \n...,2,1,1.0
2,4925,sci.med,"\n\nOkay Mr. Dyer, we're properly impressed wi...",3,3,
3,4926,sci.electronics,I just bought a little gizmo that is supposed ...,2,2,
4,4927,comp.graphics,molecules has evolved from an esoteric academi...,0,0,
...,...,...,...,...,...,...
4919,9842,sci.space,\n\nI was suggesting that the minority of prof...,4,4,
4920,9843,sci.med,"% mail newsserv@kiae.su\nSubject: PLEASE, HEL...",3,3,
4921,9844,comp.graphics,\nMine was beautiful for a year and a half. T...,0,0,
4922,9845,sci.med,\n\nAs a child i can remember picking up a cen...,3,3,


In [233]:
test_df.dtypes

post_id                 int64
topic                  object
text                   object
forum_id                int64
predicted_forum_id      int64
reassign              float64
dtype: object

In [271]:
test_df.to_csv('result.csv')