In [None]:
# Mount Google Drive inside the Colab runtime; the dataset is read from
# /content/drive/MyDrive/... further down, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from matplotlib import pyplot as plt
import json
import numpy as np
import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import pickle
from scipy.sparse import coo_matrix, hstack

from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import v_measure_score
from sklearn.metrics.cluster import completeness_score

from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans, MiniBatchKMeans
import datetime
from datetime import timedelta

from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# Load the full dataset from the mounted Drive folder (the drive.mount cell must have run).
total=pd.read_json("/content/drive/MyDrive/NLP_News/Data/total.json")
# Trailing expression: the notebook displays (rows, columns).
total.shape





(378370, 5)

In [None]:
# Name the columns positionally; the list length must equal the frame's column count.
# NE presumably stands for "named entities" -- TODO confirm. find_important_vocab
# flattens this column, so each cell is expected to be a list of strings.
total.columns = ['id', 'date','text','username','NE']
total.head()

Unnamed: 0,id,date,text,username,ALL
0,679451396844990465,2015-12-23 00:00:11,gulf nations demand release of abducted qatari...,ajenews,"[iraq, qataris, gulf nations]"
1,679451620514611201,2015-12-23 00:01:04,midway through enrollment season for president...,ap,[obama]
2,679452844307705856,2015-12-23 00:05:56,russia bombs kill syria civilians,bbcworld,"[syria, russia]"
3,679453367106678785,2015-12-23 00:08:01,six republican candidates expected in fox busi...,cnn,"[fox, republican, six]"
4,679453408324108288,2015-12-23 00:08:10,video bagpipes amid the bustle in tokyo,bbcworld,[tokyo]


In [None]:
import itertools
def find_important_vocab(dataset):
  """Collect the distinct, lower-cased entries of the ``NE`` column.

  Args:
    dataset: DataFrame with an ``NE`` column whose cells are iterables of
      strings (one list per row).

  Returns:
    Alphabetically sorted list of unique lower-cased strings. The list is
    empty when no row carries any entry.
  """
  vocab = set()
  for entities in dataset.NE.to_list():
    # A missing cell shows up as None or float NaN; there is nothing to collect.
    if entities is None or isinstance(entities, float):
      continue
    vocab.update(str(entity).lower() for entity in entities)

  # Sorted so the vocabulary (and therefore the TF-IDF column order built from
  # it) is identical on every run; plain set order varies between kernels.
  return sorted(vocab)

In [None]:
def tf_idf_vec(start,c_time,types):
  """Fit a TF-IDF model on the rows of the global ``total`` inside one time window.

  The window is ``c_time`` days long. With ``types="week"`` it opens at the
  date of row ``start``; with ``types="half_week"`` it opens ``int(c_time/2)``
  days after that date.

  Args:
    start: index label of the row in ``total`` whose date anchors the window.
    c_time: window length in days.
    types: ``"week"`` or ``"half_week"``.

  Returns:
    ``(vectorizer, tfidf_matrix, end_index, start_index, middle_time)``.
    ``start_index``/``end_index`` are the first and last index labels of the
    rows in the window (note that the end comes first in the tuple) and
    ``middle_time`` is the window start plus ``int(c_time/2)`` days.
    ``(-1, -1, -1, -1, -1)`` is returned, after printing a message, when the
    window cannot be built: unknown ``start``, no rows, or no vocabulary.

  Raises:
    ValueError: if ``types`` is neither ``"week"`` nor ``"half_week"``.
  """
  if types not in ("week", "half_week"):
    raise ValueError('types must be "week" or "half_week", got %r' % (types,))

  empty_result = (-1,-1,-1,-1,-1)
  half_window = datetime.timedelta(days=int(c_time/2))

  try:
    min_time = total.date[start]
  except (KeyError, IndexError):
    # The anchor row does not exist, so there is no window to build.
    print("The Week Is Empty!!!!")
    return empty_result

  if types=="half_week":
    min_time = min_time + half_window
  middle_time = min_time + half_window
  max_time = min_time + datetime.timedelta(days=c_time)

  dt = total[(total.date >= min_time) & (total.date < max_time)]
  if dt.empty:
    print("The Week Is Empty!!!!")
    return empty_result

  first_ind = dt.index[0]
  last_ind = dt.index[-1]
  print("start",first_ind)
  print("end",last_ind)

  important_vocab = find_important_vocab(dt)
  if len(important_vocab) == 0:
    # TfidfVectorizer refuses an empty vocabulary; report the window as empty.
    print("The Week Is Empty!!!!")
    return empty_result

  # NOTE(review): TfidfVectorizer's default ngram_range is (1, 1), so multi-word
  # vocabulary entries (e.g. "gulf nations") can never match a token -- confirm
  # whether that is intended.
  cv = TfidfVectorizer(stop_words="english",vocabulary=important_vocab)
  total_tweets = cv.fit_transform(dt["text"].tolist())

  return cv,total_tweets,last_ind,first_ind,middle_time


In [None]:
def km_cluster(total_tweets, step=22):
  """Choose a KMeans cluster count by silhouette score and return the fitted model pickled.

  Candidate cluster counts run from ``int(3*n/8)`` up to (but excluding)
  ``int(n/2)`` in increments of ``step``, where ``n`` is the number of rows of
  ``total_tweets``. Each candidate is fitted with ``random_state=0`` and scored
  with ``silhouette_score``. The search stops early once at least three
  candidates were scored and the last two scores are exactly equal.

  Args:
    total_tweets: feature matrix with one row per document.
    step: distance between consecutive candidate cluster counts.

  Returns:
    ``pickle.dumps`` of the fitted ``KMeans`` with the highest score (the
    first one on ties).

  Raises:
    ValueError: if the matrix has too few rows to yield any valid candidate.
  """
  n_samples = total_tweets.shape[0]

  # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1, so
  # drop candidates outside that range instead of failing inside the loop.
  candidates = [int(k) for k in np.arange(int(3*n_samples/8), int(n_samples/2), step)
                if 2 <= k <= n_samples - 1]
  if not candidates:
    raise ValueError("km_cluster: %d rows give no valid cluster count to try" % n_samples)

  scores = []
  best_score = None
  best_km = None
  for n_clusters in candidates:
    km = KMeans(n_clusters=n_clusters, random_state=0)
    labels = km.fit_predict(total_tweets)

    silhouette_avg = silhouette_score(total_tweets, labels)
    scores.append(silhouette_avg)

    # Keep the fitted winner so it need not be refitted after the search; the
    # strict ">" keeps the first candidate on ties.
    if best_score is None or silhouette_avg > best_score:
      best_score = silhouette_avg
      best_km = km

    if len(scores)>2 and scores[-1]==scores[-2]:
      break

  print("max_score",best_score)
  return pickle.dumps(best_km)

In [None]:
def main_func(total,c_time,start_point):
  """Slide over the data window by window and fit one TF-IDF + KMeans model per window.

  Every pass fits a "week" window anchored at row ``start`` and, when it is not
  empty, a "half_week" window shifted by half the window length. The next pass
  is anchored on the row right after the last row of the "week" window.

  Note that ``tf_idf_vec`` reads the module-level ``total``; the ``total``
  parameter is used here only for its row count.

  Args:
    total: DataFrame whose row count bounds the loop.
    c_time: window length in days, forwarded to ``tf_idf_vec``.
    start_point: index label of the first row to anchor a window on.

  Returns:
    DataFrame with columns ``week``, ``middle_time``, ``model``, ``tf_idf``,
    ``start_ind`` and ``end_ind``. There is one row per fitted window;
    half-week windows are numbered ``i + 0.5``. ``model`` and ``tf_idf`` hold
    pickled bytes.
  """
  columns = ['week','middle_time','model','tf_idf', 'start_ind', 'end_ind']
  # Rows are collected in a list and turned into a frame once at the end;
  # DataFrame.append was removed in pandas 2.0 and copied the frame every call.
  records = []
  start = start_point
  i = 1
  print(total.shape[0])
  # "<" rather than "<=": assuming rows are labelled consecutively from
  # start_point, the last valid label is start_point + len(total) - 1, and
  # "<=" made one extra pass anchored on a row that does not exist.
  while start < total.shape[0]+start_point:

    tf_model,total_tweets,end_ind,start_ind,middle_time=tf_idf_vec(start,c_time,types="week")
    if end_ind==-1:
      # No usable window at this anchor (tf_idf_vec returned its sentinel), so
      # there is nothing to cluster; retry from the next row.
      start+=1
      continue

    cluster=km_cluster(total_tweets)
    records.append({'week':i, 'middle_time':middle_time, 'model':cluster,'tf_idf':pickle.dumps(tf_model),
                    'start_ind':start_ind,'end_ind':end_ind})

    tf_model_2,total_tweets_2,end_ind_2,start_ind_2,middle_time_2=tf_idf_vec(start,c_time,types="half_week")
    if end_ind_2!=-1:
      cluster_2=km_cluster(total_tweets_2)
      records.append({'week':i+0.5, 'middle_time':middle_time_2, 'model':cluster_2,'tf_idf':pickle.dumps(tf_model_2),
                      'start_ind':start_ind_2,'end_ind':end_ind_2})

    # The week counter advances whether or not the half-week window had data.
    i+=1
    start=end_ind+1

  return pd.DataFrame(records, columns=columns)