In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import gensim

from gensim.models import Word2Vec, KeyedVectors

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re
import matplotlib.pyplot as plt
import nltk

from tensorflow.keras.utils import to_categorical




In [2]:
data = pd.read_csv("data/news_class.csv")

df = data.copy() # to be safe and avoid errors

df = df.loc[:,["data_id" , "content" , "category_level_1" , "category_level_2"]]
df.groupby(['category_level_1'])['data_id'].agg('count')

category_level_1
arts, culture, entertainment and media        300
conflict, war and peace                       800
crime, law and justice                        500
disaster, accident and emergency incident     500
economy, business and finance                 400
education                                     607
environment                                   600
health                                        700
human interest                                600
labour                                        703
lifestyle and leisure                         300
politics                                      900
religion and belief                           800
science and technology                        800
society                                      1100
sport                                         907
weather                                       400
Name: data_id, dtype: int64

In [3]:
def labeler(dataframe_column):
    encoder=LabelEncoder()
    
    labels = encoder.fit_transform(dataframe_column)
    print(encoder.classes_)
    print("We did it boys , labels have been created")
    
    return(pd.DataFrame(labels))

In [4]:
# Function for removing ASCII characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

# Function for converting to lower case
def make_lower_case(text):
    return text.lower()

# Function for removing stop words

def remove_stop_words(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text =  tokenizer.tokenize(text)
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text


# Function for removing html
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

# Function for removing punctuation
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

def lemm_text(text):
    lemm=WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    tokens = text
    return ' '.join([lemm.lemmatize(t) for t in tokens])


def remove_digits(text):
    text = re.sub(r'\d', '', text)
    #text = " ".join(text)
    return text

In [5]:
df['Cleaned'] = df['content'].apply(_removeNonAscii)
df['Cleaned'] = df.Cleaned.apply(func = make_lower_case)
df['Cleaned'] = df.Cleaned.apply(func = remove_stop_words)
df['Cleaned'] = df.Cleaned.apply(func = remove_punctuation)
df['Cleaned'] = df.Cleaned.apply(func = remove_html)
df['Cleaned'] = df.Cleaned.apply(func = lemm_text)
df['Cleaned'] = df.Cleaned.apply(func = remove_digits)
df = df.drop('content' , axis =1)

In [6]:
df

Unnamed: 0,data_id,category_level_1,category_level_2,Cleaned
0,1809,"crime, law and justice",crime,virginia woman whose year old son found trash...
1,1980,"crime, law and justice",crime,authority trying determine anyone helped two i...
2,1995,"crime, law and justice",crime,year old suspect double homicide escaped cust...
3,2740,"crime, law and justice",crime,mother two young child found hanging pennsylva...
4,7038,"crime, law and justice",crime,one family member said derek violent attacked ...
...,...,...,...,...
10912,907640,"conflict, war and peace",post-war reconstruction,post originally published site beirut lebanon ...
10913,892720,"conflict, war and peace",post-war reconstruction,post originally published site kiev october t...
10914,870499,"conflict, war and peace",post-war reconstruction,post http www presstv ir detail iran suppo...
10915,887334,"conflict, war and peace",post-war reconstruction,post http www presstv ir detail iraq salih...


In [7]:
io = df.copy()

io = io.drop([374]).reset_index(drop=True)
io = io.drop([6527]).reset_index(drop=True)

corpus_full = []
for words in io['Cleaned']:
    corpus_full.append(words.split()) 

In [8]:
def vectorizer():
    
    
    # Creating a list for storing the vectors ('Description' into vectors)
    #global word_embeddings
    word_embeddings = []
    i = 0
    # Reading the each 'Description'
    for line in io['Cleaned']:
        avgword2vec = None
        count = 0
        for word in line.split():
            if word in model.wv.key_to_index:
                count += 1
                if avgword2vec is None:
                    avgword2vec = model.wv[word]
                else:
                    avgword2vec = avgword2vec + model.wv[word]
                
        if avgword2vec is not None:
            avgword2vec = avgword2vec / count
            word_embeddings.append(avgword2vec)
        else:
            print("I found it , the error occurs at line:" , i)
          
        i +=1 

    return(pd.DataFrame(word_embeddings))  # Returning our Data as a Dataframe (aesthetic reasons)

In [9]:
#model = Word2Vec(sentences=corpus_train_full, vector_size=200, window=4, min_count=2, sg = 1 , hs = 1)  # skipgram architecture



model = Word2Vec.load("model_no_numbers.h3")

In [10]:
vocab_len = len(model.wv)

vocab_len

57876

In [11]:
vect = vectorizer()

In [12]:
y1 = labeler(io["category_level_1"])


X = vect.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2,random_state=42)

y_train1 = to_categorical(y_train)

['arts, culture, entertainment and media' 'conflict, war and peace'
 'crime, law and justice' 'disaster, accident and emergency incident'
 'economy, business and finance' 'education' 'environment' 'health'
 'human interest' 'labour' 'lifestyle and leisure' 'politics'
 'religion and belief' 'science and technology' 'society' 'sport'
 'weather']
We did it boys , labels have been created


In [19]:
y_test1 = to_categorical(y_test)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Flatten , Input , Conv2D , MaxPooling2D , BatchNormalization

In [14]:
model_3 = Sequential()
model_3.add(Dense(200,input_dim=200,activation = "leaky_relu"))
model_3.add(Dense(360,activation = "leaky_relu"))
model_3.add(tf.keras.layers.Dropout(0.2))
#model_3.add(Dense(60,activation = "leaky_relu"))
#model6.add(Dense(30,activation = "leaky_relu"))
model_3.add(Dense(17,activation = "softmax"))




In [15]:
opt = tf.keras.optimizers.AdamW(learning_rate =0.007 , beta_1=0.9,
    beta_2=0.999,
    use_ema=True,
    ema_momentum=0.99)

In [16]:
model_3.compile(optimizer = opt , 
              loss = 'categorical_crossentropy' ,
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

In [21]:
model_3.fit(X_train , y_train1 , batch_size = 16 , epochs = 4 , validation_split=(0.2) ,verbose =1)


predictions = np.argmax(model_3.predict(X_test), axis=-1)

print(predictions)


print(classification_report(y_test.values , predictions))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[14  1  3 ... 11  9 12]
              precision    recall  f1-score   support

           0       0.61      0.51      0.56        68
           1       0.79      0.75      0.77       176
           2       0.74      0.71      0.73        97
           3       0.64      0.73      0.68        90
           4       0.74      0.53      0.62        93
           5       0.69      0.77      0.73       108
           6       0.78      0.83      0.80       126
           7       0.75      0.71      0.73       136
           8       0.77      0.48      0.59       122
           9       0.83      0.59      0.69       155
          10       0.66      0.72      0.69        61
          11       0.56      0.69      0.62       172
          12       0.72      0.79      0.75       182
          13       0.67      0.60      0.63       151
          14       0.51      0.65      0.57       200
          15       0.80      0.89      0.85       169
          16     

In [22]:
#accuracy = cnn_model.evaluate(X_test, caty_test, verbose=False)

scores = model_3.evaluate(X_test, y_test1)

for i, m in enumerate(model_3.metrics_names):
    print("\n%s: %.3f"% (m, scores[i]))


#print("Testing Accuracy:  {:.4f}".format(accuracy[1]))


loss: 0.991

accuracy: 0.966

precision: 0.805

recall: 0.557


In [52]:
"""
1 - > 68 , 2 -> 69 , 3 -> 70 , 4 - > 71 , 7 - > 72 , 25, 



8 -> 74
16 -> 75





"""

'\n1 - > 68 , 2 -> 69 , 3 -> 70 , 4 - > 71 , 7 - > 72 , 25, \n\n\n\n8 -> 74\n16 -> 75\n\n\n\n\n\n'

In [62]:
model_3.fit(X_train , y_train1 , batch_size = 16 , epochs = 25 , validation_split=(0.2) ,verbose =1)


predictions = np.argmax(model_3.predict(X_test), axis=-1)

print(predictions)


print(classification_report(y_test.values , predictions))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
[14  1 16 ... 11  9 12]
              precision    recall  f1-score   support

           0       0.74      0.62      0.67        68
           1       0.78      0.78      0.78       176
           2       0.81      0.69      0.74        97
           3       0.72      0.64      0.68        90
           4       0.75      0.63      0.69        93
           5       0.72      0.82      0.77       108
           6       0.83      0.79      0.81       126
           7       0.72      0.81      0.76       136
           8       0.74      0.66      0.70       122
           9       0.83      0.74      0.78       155
          10       0.78      0.66      0.71        61
          11       0.62      0.66   

# Model 4

In [76]:
model_4 = Sequential()
model_4.add(Dense(150,input_dim=200,activation = "leaky_relu"))
model_4.add(Dense(270,activation = "leaky_relu"))
model_4.add(tf.keras.layers.Dropout(0.2))
#model_4.add(Dense(60,activation = "leaky_relu"))
#model6.add(Dense(30,activation = "leaky_relu"))
model_4.add(Dense(17,activation = "softmax"))

In [77]:
opt = tf.keras.optimizers.AdamW(learning_rate =0.007 , beta_1=0.9,
    beta_2=0.999,
    use_ema=True,
    ema_momentum=0.99)

In [78]:
model_4.compile(optimizer = opt , 
              loss = 'categorical_crossentropy' ,
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

In [79]:
model_4.fit(X_train , y_train1 , batch_size = 16 , epochs = 25 , validation_split=(0.2) ,verbose =1)


predictions = np.argmax(model_4.predict(X_test), axis=-1)

print(predictions)


print(classification_report(y_test.values , predictions))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
[14  1 16 ... 11  9 12]
              precision    recall  f1-score   support

           0       0.73      0.60      0.66        68
           1       0.81      0.78      0.79       176
           2       0.72      0.76      0.74        97
           3       0.69      0.66      0.67        90
           4       0.83      0.56      0.67        93
           5       0.70      0.74      0.72       108
           6       0.82      0.87      0.85       126
           7       0.76      0.76      0.76       136
           8       0.74      0.65      0.69       122
           9       0.83      0.72      0.77       155
          10       0.77      0.79      0.78        61
          11       0.63      0.66   

In [32]:
model = Word2Vec(sentences=corpus_full, vector_size=200, window=4, min_count=10, sg = 1 , hs = 1)  # skipgram architecture

In [33]:
vect = vectorizer()

y1 = labeler(io["category_level_1"])


X = vect.copy()

X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.2,random_state=42)

y_train1 = to_categorical(y_train)

['arts, culture, entertainment and media' 'conflict, war and peace'
 'crime, law and justice' 'disaster, accident and emergency incident'
 'economy, business and finance' 'education' 'environment' 'health'
 'human interest' 'labour' 'lifestyle and leisure' 'politics'
 'religion and belief' 'science and technology' 'society' 'sport'
 'weather']
We did it boys , labels have been created


In [40]:
model_6 = Sequential()
model_6.add(Dense(200,input_dim=200,activation = "leaky_relu"))
model_6.add(Dense(360,activation = "leaky_relu"))
model_6.add(tf.keras.layers.Dropout(0.2))
#model_3.add(Dense(60,activation = "leaky_relu"))
#model6.add(Dense(30,activation = "leaky_relu"))
model_6.add(Dense(17,activation = "softmax"))

In [41]:
opt = tf.keras.optimizers.AdamW(learning_rate =0.001 , beta_1=0.9,
    beta_2=0.999,
    use_ema=True,
    ema_momentum=0.99)

In [42]:
model_6.compile(optimizer = opt , 
              loss = 'categorical_crossentropy' ,
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

In [93]:
model_6.fit(X_train , y_train1 , batch_size = 16 , epochs = 1 , validation_split=(0.2) ,verbose =1)


predictions = np.argmax(model_6.predict(X_test), axis=-1)

print(predictions)


print(classification_report(y_test.values , predictions))

[14  1 16 ... 11  9 12]
              precision    recall  f1-score   support

           0       0.62      0.57      0.60        68
           1       0.80      0.78      0.79       176
           2       0.73      0.74      0.73        97
           3       0.67      0.73      0.70        90
           4       0.70      0.67      0.69        93
           5       0.66      0.82      0.73       108
           6       0.88      0.83      0.85       126
           7       0.73      0.71      0.72       136
           8       0.69      0.71      0.70       122
           9       0.84      0.69      0.76       155
          10       0.80      0.70      0.75        61
          11       0.65      0.63      0.64       172
          12       0.74      0.75      0.74       182
          13       0.66      0.66      0.66       151
          14       0.59      0.60      0.60       200
          15       0.88      0.91      0.89       169
          16       0.87      0.95      0.91        77

  

In [94]:
"""
1->64 , 2->67 , 3->68 , 4->69 , 5->70 , 6->71 ,8->72 , 11->73 , 13-> 74 , 15 ->74 , 22 ->75 , 50

"""

'\n1->64 , 2->67 , 3->68 , 4->69 , 5->70 , 6->71 ,8->72 , 11->73 , 13-> 74 , 15 ->74 , 22 ->75 , 50\n\n'

In [99]:
model_si = Sequential()
model_si.add(Dense(200,input_dim=200,activation = "leaky_relu"))
model_si.add(Dense(360,activation = "leaky_relu"))
model_si.add(tf.keras.layers.Dropout(0.2))
#model_si.add(Dense(60,activation = "leaky_relu"))
#model6.add(Dense(30,activation = "leaky_relu"))
model_si.add(Dense(17,activation = "sigmoid"))

In [100]:
opt = tf.keras.optimizers.AdamW(learning_rate =0.001 , beta_1=0.9,
    beta_2=0.999,
    use_ema=True,
    ema_momentum=0.99)

model_si.compile(optimizer = opt , 
              loss = 'categorical_crossentropy' ,
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy'),
                       tf.keras.metrics.Precision(name='precision'),
                       tf.keras.metrics.Recall(name='recall')])

In [107]:
model_si.fit(X_train , y_train1 , batch_size = 16 , epochs = 1 , validation_split=(0.2) ,verbose =1)


predictions = np.argmax(model_si.predict(X_test), axis=-1)

print(predictions)


print(classification_report(y_test.values , predictions))

[14  1 16 ... 11  9 12]
              precision    recall  f1-score   support

           0       0.59      0.50      0.54        68
           1       0.80      0.77      0.78       176
           2       0.71      0.75      0.73        97
           3       0.70      0.69      0.70        90
           4       0.76      0.57      0.65        93
           5       0.67      0.79      0.73       108
           6       0.83      0.80      0.82       126
           7       0.72      0.78      0.75       136
           8       0.68      0.64      0.66       122
           9       0.83      0.65      0.73       155
          10       0.67      0.66      0.66        61
          11       0.62      0.64      0.63       172
          12       0.72      0.82      0.77       182
          13       0.63      0.64      0.63       151
          14       0.59      0.56      0.57       200
          15       0.81      0.88      0.84       169
          16       0.83      0.99      0.90        77

  

In [None]:
"""
1->65 , 2->67 , 3->69, 4->69 ,5->70 ,6->71 
,8->72 , 11->73 , 13-> 74 , 15 ->74 , 22 ->75 , 50
"""