# Kmeans (n number of groups)
# Kmeans++
# DBscan



#RFM:
## R  : last buy 
## F   : count of buyed
## M  : mean all price 

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as PLT
import seaborn as SB
import datetime as dt
from sklearn.cluster import KMeans,DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.cluster import SpectralClustering
import nltk
import hazm


In [None]:
DF = pd.read_csv('../../datasets/OnlineRetail.csv', encoding= 'unicode_escape')
DF.shape

In [None]:
DF['InvoiceDate'] = DF['InvoiceDate'].astype(np.datetime64)
DF['InvoiceDay'] = DF['InvoiceDate'].dt.date
DF = DF[DF['CustomerID'].notnull()] 
DF['TotalSum'] = DF['Quantity'] * DF['UnitPrice'] 

In [None]:
DF.info()

In [None]:
DF.head()

In [None]:
pin_date = max(DF['InvoiceDay']) + dt.timedelta(1)
pin_date

In [None]:
RFM = DF.groupby('CustomerID').agg({
    'InvoiceDate' : lambda x : (pin_date - dt.date(x.max().year,x.max().month,x.max().day)).days,
#     'InvoiceNo' : lambda x : x.count(),
    'InvoiceNo' : 'count',
    'TotalSum'  : 'sum'
})

In [None]:
RFM.rename(columns={
    'InvoiceDate' : 'Recency',
    'InvoiceNo':'Frequency',
    'TotalSum':'Monetary'
},inplace=True)

### Ranking Bucket Scaling (Q Cut) : first sort data min to max then bucketing

In [None]:
r_labels = range(1,5,1)
RFM['R'] = pd.qcut(RFM['Recency'],q=4 , labels=r_labels)

f_labels = range(4,0,-1)
RFM['F']  = pd.qcut(RFM['Frequency'],q=4 , labels=f_labels)

m_labels = range(4,0,-1)
RFM['M']  = pd.qcut(RFM['Monetary'],q=4 , labels=m_labels)

In [None]:
X = RFM[['R','F','M']]

In [None]:
kmeans = KMeans(n_clusters=4,init='k-means++', max_iter=800)
kmeans.fit(X)

In [None]:
RFM['kmeans_cluster'] = kmeans.labels_

In [None]:
np.unique(kmeans.labels_,return_counts=True)

In [None]:
RFM

In [None]:
RFM.mean()

In [None]:
RFM[RFM['kmeans_cluster'] == 0 ].mean()

In [None]:
RFM[RFM['kmeans_cluster'] == 1 ].mean()

In [None]:
RFM[RFM['kmeans_cluster'] == 2 ].mean()

In [None]:
RFM[RFM['kmeans_cluster'] == 3 ].mean()

In [None]:
RFM[RFM['kmeans_cluster'] == 4 ].mean()

# metric : WCSS - ELBOW
### WCSS ===> less better

In [None]:
wcss = {}

for k in range(1,11):
    kmeans = KMeans(n_clusters=k,init='k-means++', max_iter=800)
    kmeans.fit(X)
    wcss[k] = kmeans.inertia_
SB.pointplot(x=list(wcss.keys()),y=list(wcss.values()))

# DBSCAN
## metric: silhouette (near 1 is better)

In [None]:
dbscan = DBSCAN(eps=.5,min_samples=200)
dbscan.fit(X)
np.unique(dbscan.labels_,return_counts=True)

In [None]:
dbscan = DBSCAN(eps=.5,min_samples=20)
dbscan.fit(X)
np.unique(dbscan.labels_)

In [None]:
silhouette_score(X,dbscan.labels_)

In [None]:
slhscores = {}

for k in range(40,100):
    dbscan = DBSCAN(eps=.5,min_samples=k)
    dbscan.fit(X)
    slhscores[k] = silhouette_score(X,dbscan.labels_)

In [None]:
PLT.figure(figsize=(15,8))
SB.pointplot(x=list(slhscores.keys()),y=list(slhscores.values()))

# Spectral clustring

In [None]:
spclcls = SpectralClustering()
spclcls.fit(X)
np.unique(spclcls.labels_,return_counts=True)

In [None]:
spclcls = SpectralClustering(n_clusters=4)
spclcls.fit(X)
np.unique(spclcls.labels_,return_counts=True)

In [None]:
silhouette_score(X,spclcls.labels_)

# MeanShift clustring

In [None]:
from sklearn.cluster import MeanShift

In [None]:
meanshiftcls = MeanShift()
meanshiftcls.fit(X)
np.unique(meanshiftcls.labels_,return_counts=True)

In [None]:
silhouette_score(X,meanshiftcls.labels_)

# Text Clustring

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
DF = pd.read_csv('../../datasets/cleaned_news.csv')
DF.head()

Unnamed: 0.1,Unnamed: 0,title_body,category
0,0,وزير علو درجمع استاد نمونه سن بازنشستگي استاد ...,آموزشي
1,1,گردهمايي دانش‌آموختگ موسسه آموز عالي سوره برگز...,آموزشي
2,2,نتايج آزمون دوره‌هاي فراگير دانشگاه پيام‌نور ن...,آموزشي
3,3,هماي يكروزه آسيب شناسي مفهو روابط عمومي بابلسر...,اجتماعي
4,4,وضعي اقتصادي ميز تحصيل والدين مهمترين عوامل مو...,آموزشي


In [8]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
vectorizer.fit(DF['title_body'])
X = vectorizer.transform(DF['title_body'])

In [9]:
DF['category'].value_counts()

اجتماعي       1000
مذهبي         1000
آموزشي        1000
ورزشي         1000
علمي          1000
بهداشتي       1000
اقتصادي       1000
سياسي         1000
فرهنگي        1000
فقه و حقوق    1000
تاريخي         999
Name: category, dtype: int64

In [28]:
kmeans = KMeans(n_clusters=4,init='k-means++', max_iter=800)
kmeans.fit(X)

SpectralClustering(n_clusters=4)

In [29]:
np.unique(spclcls.labels_,return_counts=True)

(array([0, 1, 2, 3], dtype=int32), array([ 834, 1225, 8637,  303]))

In [30]:
silhouette_score(X,spclcls.labels_)

-0.00127064511413753

In [31]:
XL = DF[['title_body']]
XL['label'] = spclcls.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  XL['label'] = spclcls.labels_


In [22]:
from collections import Counter

In [32]:
all_words = ''.join(XL[XL['label']==1]['title_body']).split(' ')
Counter(all_words).most_common(100)

[('دانشگاه', 8436),
 ('اين', 7770),
 ('كه', 5600),
 ('دانشجوي', 3782),
 ('علو', 2622),
 ('كرد', 2594),
 ('براي', 2547),
 ('وي', 2437),
 ('علمي', 2324),
 ('آموزشي', 2280),
 ('آموز', 2279),
 ('كشور', 2274),
 ('پزشكي', 2083),
 ('دك', 2080),
 ('سال', 1918),
 ('اير', 1620),
 ('مي\u200cشود', 1584),
 ('بي', 1528),
 ('نيز', 1458),
 ('وزار', 1444),
 ('آزمون', 1379),
 ('بايد', 1329),
 ('تهر', 1264),
 ('يك', 1226),
 ('', 1201),
 ('ايسنا', 1137),
 ('اسلامي', 998),
 ('خبرگزاري', 996),
 ('بهدا', 966),
 ('ادامه', 961),
 ('آزاد', 940),
 ('كارشناسي', 927),
 ('عالي', 924),
 ('دانشجو', 921),
 ('گزار', 892),
 ('معاون', 855),
 ('اشاره', 851),
 ('دانشكده', 846),
 ('رييس', 840),
 ('همچنين', 839),
 ('اظهار', 833),
 ('دانشگاهي', 816),
 ('دانشجويي', 804),
 ('خبرنگار', 776),
 ('برگزار', 751),
 ('كار', 743),
 ('هي', 740),
 ('اينكه', 726),
 ('فرهنگي', 723),
 ('تحقيق', 722),
 ('تحصيل', 713),
 ('ساز', 701),
 ('ايجاد', 691),
 ('قرار', 689),
 ('دوره', 674),
 ('توسعه', 666),
 ('پذير', 664),
 ('توجه', 663),
 ('پژوهشي', 