In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [115]:
root_path = '/content/drive/MyDrive/ColabNotebooks/zindi/GenderBaseViolence/'

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
data = pd.read_csv(root_path + 'Train.csv')

In [119]:
test = pd.read_csv(root_path + 'Test.csv')

In [None]:
data.head()

In [6]:
data.shape

(39650, 3)

In [None]:
data['type'].unique()

In [None]:
data['type'].nunique()

Checking for null values.

In [None]:
pd.isnull(data['type']).sum()

No null values are present.

In [None]:
data['tweet'].apply(lambda x: len(x.split(' '))).sum()

We have more than 10 million words in the data.

Distribution of classes.

In [None]:
plt.figure(figsize=(17,6))
sns.countplot(data['type'])

From the above visualisation we can see classes are very well balanced.

In [15]:
def post_tag(index):
    post = data[data.index==index].values[0]
    print(post[0])
    print(post[1])
    print('Type:', post[2])

Looking few posts and tags.

In [None]:
post_tag(5)

In [None]:
post_tag(11)

In [None]:
post_tag(10000)

The texts needs to be cleaned up.

In [None]:
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [24]:
def clean_text(text):
    text = BeautifulSoup(text,'lxml').text
    text = text.lower()
    text = re.sub('[/(){}\[\]\|@,;]', " ",text)
    text = re.sub('[^0-9a-z #+_]'," ",text)
    text = ' '.join(word for word in text.split() if word not in set(stopwords.words('english')))
    return text

In [25]:
data['tweet'] = data['tweet'].apply(clean_text)

In [None]:
post_tag(5)

In [None]:
post_tag(11)

In [None]:
post_tag(10000)

Much better.

In [None]:
data['tweet'].apply(lambda x: len(x.split(" "))).sum()

Now, we have over 3 million words to work with.

Splitting the dataset.

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X = data['tweet']
y = data['type']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [33]:
X_train.shape

(31720,)

In [34]:
X_test.shape

(7930,)

In [35]:
X_test.head()

26498    wrestled ground tased knife said going car get...
16353    funny story uncle told cause working children ...
27711    remember also victim rapist raped attempted ra...
32953    honestly could scream find one woman sexually ...
31977    many statues history books depict thomas jeffe...
Name: tweet, dtype: object

The next steps includes feature engineering and model building using pipeline.

## Naive Bayes classifier for multinomial models.

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [117]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [38]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [39]:
y_pred = nb.predict(X_test)

In [42]:
print(classification_report(y_pred, y_test,target_names=data['type'].unique()))
print(accuracy_score(y_pred,y_test))

                              precision    recall  f1-score   support

             sexual_violence       0.00      0.00      0.00         0
           Physical_violence       0.51      1.00      0.68       603
          emotional_violence       0.00      0.00      0.00         0
Harmful_Traditional_practice       0.04      1.00      0.07         5
           economic_violence       1.00      0.89      0.94      7322

                    accuracy                           0.90      7930
                   macro avg       0.31      0.58      0.34      7930
                weighted avg       0.96      0.90      0.92      7930

0.901765447667087


  _warn_prf(average, modifier, msg_start, len(result))


# Linear support vector machine.

In [43]:
from sklearn.linear_model import SGDClassifier

In [44]:
svm = Pipeline([('vect',CountVectorizer()),
               ('tfidf',TfidfTransformer()),
               ('svm_clf', SGDClassifier(loss='hinge', penalty='l2',alpha=0.001,n_jobs=-1,random_state=0,max_iter=5))])
svm.fit(X_train,y_train)             



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('svm_clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                  

In [46]:
%%time
y_pred_svm = svm.predict(X_test)
print(accuracy_score(y_test,y_pred_svm))
print(classification_report(y_test,y_pred_svm, target_names=data['type'].unique()))

0.9751576292559899
                              precision    recall  f1-score   support

             sexual_violence       0.00      0.00      0.00        33
           Physical_violence       0.99      0.99      0.99      1179
          emotional_violence       0.00      0.00      0.00        45
Harmful_Traditional_practice       1.00      0.22      0.36       130
           economic_violence       0.97      1.00      0.99      6543

                    accuracy                           0.98      7930
                   macro avg       0.59      0.44      0.47      7930
                weighted avg       0.97      0.98      0.97      7930

CPU times: user 380 ms, sys: 9.11 ms, total: 389 ms
Wall time: 395 ms


  _warn_prf(average, modifier, msg_start, len(result))


# Logistic regression.

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
lr = Pipeline([('vect',CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('lr_clf', LogisticRegression(n_jobs=-1,C=0.01))])

In [49]:
lr.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('lr_clf',
                 LogisticRegression(C=0.01, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1

In [50]:
y_pred_lr = lr.predict(X_test)

In [51]:
print(accuracy_score(y_pred_lr,y_test))
print(classification_report(y_pred_lr,y_test,target_names= data['type'].unique()))

0.9044136191677176
                              precision    recall  f1-score   support

             sexual_violence       0.00      0.00      0.00         0
           Physical_violence       0.53      1.00      0.70       629
          emotional_violence       0.00      0.00      0.00         0
Harmful_Traditional_practice       0.00      0.00      0.00         0
           economic_violence       1.00      0.90      0.95      7301

                    accuracy                           0.90      7930
                   macro avg       0.31      0.38      0.33      7930
                weighted avg       0.96      0.90      0.93      7930



  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest.

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
rfc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('rfc', RandomForestClassifier(n_estimators=200, criterion='entropy'))])
rfc.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='entropy',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                             

In [54]:
y_pred_rfc = rfc.predict(X_test)

In [55]:
print(accuracy_score(y_pred_rfc,y_test))
print(classification_report(y_pred_rfc,y_test))

0.9962168978562421
                              precision    recall  f1-score   support

Harmful_Traditional_practice       0.70      1.00      0.82        23
           Physical_violence       1.00      1.00      1.00      1178
           economic_violence       0.78      1.00      0.88        35
          emotional_violence       0.95      1.00      0.98       124
             sexual_violence       1.00      1.00      1.00      6570

                    accuracy                           1.00      7930
                   macro avg       0.89      1.00      0.93      7930
                weighted avg       1.00      1.00      1.00      7930



# Creating bag of words with Keras.

In [56]:
from tensorflow import keras

In [78]:
from keras import utils as np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing import text, sequence

In [79]:
train_size = int(len(data) * 0.7)

In [80]:
print('Train size: %d' %train_size)
print('Test_size: %d' %(len(data)-train_size))

Train size: 27755
Test_size: 11895


In [81]:
train_post = data['tweet'][:train_size]
train_tag = data['type'][:train_size]
test_post = data['tweet'][train_size:]
test_tag = data['type'][train_size:]

In [82]:
train_post.head()

0    dream got raped last night guy work actually g...
1    thought word raped means sex told saw dogs rap...
2    talking raped 2 men 1 molested jail nother cha...
3    sexually abused 3 years age 4 7 one believed r...
4    chessy prout better telling truth selling owen...
Name: tweet, dtype: object

In [83]:
train_tag.head()

0    sexual_violence
1    sexual_violence
2    sexual_violence
3    sexual_violence
4    sexual_violence
Name: type, dtype: object

In [84]:
test_post.head()

27755    used real name found one followers convicted s...
27756    fuck love care lot asked okay say said yes eve...
27757                               raped asking nuff said
27758    report dcp zone 9 multiple threats kidnapping ...
27759    one detained woman compared visit gynecologist...
Name: tweet, dtype: object

In [85]:
test_tag.head()

27755    sexual_violence
27756    sexual_violence
27757    sexual_violence
27758    sexual_violence
27759    sexual_violence
Name: type, dtype: object

In [86]:
token = text.Tokenizer(num_words=1000,char_level=False)

In [87]:
token.fit_on_texts(train_post)

In [88]:
x_train = token.texts_to_matrix(train_post)
x_test = token.texts_to_matrix(test_post)

In [89]:
print(x_train)
print(x_train.shape)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
(27755, 1000)


In [90]:
print(x_test)
print(x_test.shape)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
(11895, 1000)


In [91]:
from sklearn.preprocessing import LabelEncoder

In [92]:
encoder = LabelEncoder()

In [93]:
Y_train = encoder.fit_transform(train_tag)
Y_test = encoder.transform(test_tag)

In [94]:
print(Y_train)
print(Y_train.shape)

[4 4 4 ... 4 4 4]
(27755,)


In [95]:
print(Y_test)
print(Y_test.shape)

[4 4 4 ... 1 4 4]
(11895,)


In [96]:
num_classes = np.max(Y_train) + 1

In [106]:
Y_train = keras.utils.to_categorical(Y_train, num_classes)

In [107]:
Y_test = keras.utils.to_categorical(Y_test, num_classes)

In [108]:
print(Y_test)
print(Y_test.shape)
print(Y_train)
print(Y_train.shape)

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]
(11895, 5)
[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]
(27755, 5)


Building model.

In [109]:
model = Sequential()
model.add(Dense(512, input_shape=(1000,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [111]:
%%time
model.fit(x_train,Y_train, batch_size=32, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
CPU times: user 1min 21s, sys: 6.4 s, total: 1min 28s
Wall time: 56.4 s


<keras.callbacks.History at 0x7fad86011490>

In [112]:
score = model.evaluate(x_test, Y_test,
                       batch_size=32, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.9991593360900879


# **Submission**

In [118]:
sample_submission = pd.read_csv(root_path + "SampleSubmission.csv")

In [137]:
##let's prdict on the test data
test_predict = model.predict(token.texts_to_matrix(test))

In [138]:
test_predict

array([[8.5330056e-04, 1.4195814e-04, 1.7853439e-04, 1.8230316e-05,
        9.9880803e-01],
       [8.5330056e-04, 1.4195814e-04, 1.7853439e-04, 1.8230316e-05,
        9.9880803e-01]], dtype=float32)

In [139]:
np.unique(test_predict)

array([1.8230316e-05, 1.4195814e-04, 1.7853439e-04, 8.5330056e-04,
       9.9880803e-01], dtype=float32)

In [140]:
sample_submission["type"] = test_predict

ValueError: ignored

In [135]:
sample_submission.to_csv(root_path + 'my_work.csv', index=False)