In [0]:
import pandas as pd
import numpy as np

In [0]:
trainData = pd.read_csv('/content/drive/My Drive/quora_dataset/train.csv')
testData = pd.read_csv('/content/drive/My Drive/quora_dataset/test.csv')

We have to use classification algorithms as we have target column in the rangee of 0 and 1 as values. So, to make this simplified we are using classification approach.

In [0]:
trainData.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [0]:
testData.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [0]:
# Checking for null values
trainData.isna().sum()

qid              0
question_text    0
target           0
dtype: int64

In [0]:
testData.isna().sum()

qid              0
question_text    0
dtype: int64

Target column is not available in test data so we have to predict target columnn for test data. We are comparing different classification models and **will use best model** to predict for test data.

In [0]:
# Size of dataset
print(f'Training data has {trainData.shape[0]} rows and {trainData.shape[1]} columns.')

Training data has 1306122 rows and 3 columns.


In [0]:
print(f'Test data has {testData.shape[0]} rows and {testData.shape[1]} columns.')

Test data has 375806 rows and 2 columns.


List of tasks needs to be done before doing any predictions. Here we are doing text pre-processing tasks such as:
*   Removing stop words
*   Converting all words into tokens and making all of them in lower case format
*   Lemmatizing all the words so it can be used as it's root form
*   We are lemmatizing it to reduce complexity and scope of our dataset





In [0]:
# We are importing spacy library for stop words list
from spacy.lang.en.stop_words import STOP_WORDS

In [0]:
print(f"There are {len(STOP_WORDS)} stop words.")

There are 326 stop words.


*   Stop words aree words in english language which does not convey any meaning in the string. Example are like a, an, the and many more.



In [0]:
import spacy.cli
# We are downloading library which is english core and which is very big library 
# and needs to be accessed in order to process our data.
spacy.cli.download('en_core_web_lg')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
import en_core_web_lg

In [0]:
import string
punct = string.punctuation
stopWords = list(STOP_WORDS)
parser = en_core_web_lg.load(disable=['tagger', 'ner'])
def spacy_tokenizer(text):
  sentence = parser(text)
  tokenized = [word.lemma_.lower().strip() for word in sentence]
  tokenized = [word for word in tokenized if word not in stopWords and word not in punct]
  tokenized = ' '.join([i for i in tokenized])
  return tokenized

In [0]:
from tqdm import tqdm

In [0]:
tqdm.pandas()

In [0]:
trainData['processed_text'] = trainData['question_text'].progress_apply(spacy_tokenizer)

100%|██████████| 1306122/1306122 [1:23:59<00:00, 259.16it/s]


In [0]:
testData['processed_text'] = testData['question_text'].progress_apply(spacy_tokenizer)

100%|██████████| 375806/375806 [24:12<00:00, 258.71it/s]


In [0]:
import pandas as pd
trainData = pd.read_csv('/content/drive/My Drive/trainProcessed.csv')
testData = pd.read_csv('/content/drive/My Drive/testProcessed.csv')

In [0]:
trainData.head()

Unnamed: 0.1,Unnamed: 0,qid,question_text,target,processed_text
0,0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,quebec nationalist province nation 1960s
1,1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,adopt dog encourage people adopt shop
2,2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,doe velocity affect time velocity affect space...
3,3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,otto von guericke use magdeburg hemisphere
4,4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,convert montra helicon d mountain bike change ...


In [0]:
testData.head()

Unnamed: 0.1,Unnamed: 0,qid,question_text,processed_text
0,0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...,woman rude arrogant little bite wealth power
1,1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...,apply rv college engineer bms college engineer...
2,2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...,like nurse practitioner
3,3,000086e4b7e1c7146103,Who are entrepreneurs?,entrepreneur
4,4,0000c4c3fbe8785a3090,Is education really making good people nowadays?,education good people nowadays


In [0]:
trainData = trainData[['qid', 'question_text', 'processed_text', 'target']]
testData = testData[['qid', 'question_text', 'processed_text']]

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def vectorize(text, maxx_features):
    vectorizer = TfidfVectorizer(max_features=maxx_features)
    X = vectorizer.fit_transform(text)
    return X

In [0]:
# We reduced max features to 2 ** 8 because of computing issues.
text = trainData['processed_text'].values.astype("U")
X = vectorize(text, 2 ** 8)
X.shape

(1306122, 256)

In [0]:
trainFeatures = pd.DataFrame(X.toarray())

In [0]:
trainFeatures['target'] = trainData['target']

In [0]:
from sklearn.linear_model import LogisticRegression

In [0]:
def classification_report(model_name, data, predicted):
  from sklearn.metrics import precision_score, recall_score
  from sklearn.metrics import accuracy_score
  from sklearn.metrics import f1_score
  print(model_name, '\n')
  print("     Precision: ", '{:,.3f}'.format(float(precision_score(data, predicted, average='macro')) * 100), "%")
  print("        Recall: ", '{:,.3f}'.format(float(recall_score(data, predicted, average='macro')) * 100), "%")
  print("      F1 score: ", '{:,.3f}'.format(float(f1_score(data, predicted, average='macro')) * 100), "%")
  print("Accuracy score: ", '{:,.3f}'.format(float(accuracy_score(data, predicted)) * 100), "%")

In [0]:
classifier = LogisticRegression(max_iter=300)

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(trainFeatures.iloc[:,:-1], trainFeatures.iloc[:, -1:], test_size=0.33, random_state=42)

In [0]:
classifier.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
classifier.score(X_test, Y_test)

0.9415921730031716

In [0]:
classifier.fit(trainFeatures.iloc[:,:-1], trainFeatures.iloc[:, -1:])
logistiicPredicted = classifier.predict(trainFeatures.iloc[:,:-1])

  y = column_or_1d(y, warn=True)


In [0]:
classification_report('Logistic Regression', trainFeatures.iloc[:, -1:], logistiicPredicted)

Logistic Regression 

     Precision:  76.991 %
        Recall:  56.448 %
      F1 score:  59.464 %
Accuracy score:  94.078 %


In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
dTC = DecisionTreeClassifier()

In [0]:
dTC.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [0]:
dTC.score(X_test, Y_test)

0.9339591342417191

In [0]:
# Depth of trees
dTC.get_depth()

624

In [0]:
dTCPredicted = dTC.predict(trainFeatures.iloc[:, :-1])

In [0]:
classification_report('Decision Tree Classifier', trainFeatures.iloc[:, -1:], dTCPredicted)

Decision Tree Classifier 

     Precision:  86.900 %
        Recall:  69.571 %
      F1 score:  75.194 %
Accuracy score:  95.571 %


In [0]:
from sklearn.linear_model import SGDClassifier

In [0]:
SGDC = SGDClassifier(max_iter=1000)

In [0]:
SGDC.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [0]:
SGDC.score(X_test, Y_test)

0.9398312379211222

In [0]:
# We are making sure of other metrics also to see weather Logistic is really good or not!

In [0]:
SGDPredicted = SGDC.predict(trainFeatures.iloc[:, :-1])

In [0]:
classification_report('SGD Classifier', trainFeatures.iloc[:, -1:], SGDPredicted)

SGD Classifier 

     Precision:  76.151 %
        Recall:  52.330 %
      F1 score:  52.934 %
Accuracy score:  93.898 %


In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
RFC = RandomForestClassifier()

In [0]:
RFC.fit(X_train, Y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
RFC.score(X_test, Y_test)

0.9409216720299011

In [0]:
RFCPredicted = RFC.predict(trainFeatures.iloc[:, :-1])

In [0]:
classification_report('RFC Classifier', trainFeatures.iloc[:, -1:], RFCPredicted)

In [0]:
RFC1 = RandomForestClassifier(criterion='entropy')

In [0]:
RFC1.fit(X_train, Y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
RFC1.score(X_test, Y_test)

0.9409170318847574

In [0]:
RFC1Predicted = RFC1.predict(trainFeatures.iloc[:, :-1])

In [0]:
classification_report('RFC1 Classifier', trainFeatures.iloc[:, -1:], RFC1Predicted)

RFC1 Classifier 

     Precision:  89.915 %
        Recall:  69.693 %
      F1 score:  75.915 %
Accuracy score:  95.799 %
