# Import Packages

In [4]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

# Load Preprocessed Dataset 

In [5]:
df = pd.read_csv('data_prep.csv', usecols=['stemmed'])
df

Unnamed: 0,stemmed
0,"['jangkau', 'hubung', 'kavling', 'tanah', 'jua..."
1,"['akwowakoawkaow', 'kos', 'gawe', 'jakarta', '..."
2,"['damage', 'parah', 'banget', 'nder', 'take', ..."
3,"['goodbener', 'ajak', 'warga', 'dki', 'jakarta..."
4,"['delapan', 'dukung', 'anies', 'butuh', 'jakar..."
...,...
95,"['saking', 'otak', 'gagas', 'ngibul', 'ngarang..."
96,"['cari', 'tanah', 'kavling', 'bangun', 'kebun'..."
97,"['march', 'jakarta', 'indonesia', 'tickets', '..."
98,"['kenneth', 'perintah', 'pusat', 'coba', 'jali..."


# Combining Fractions of Words into One Sentence

In [6]:
def join_text_list(texts):
    texts = ast.literal_eval(texts)
    return ' '.join([text for text in texts])

df["tweet_join"] = df["stemmed"].apply(join_text_list)

df["tweet_join"].head()

0    jangkau hubung kavling tanah jual murah kavlin...
1                  akwowakoawkaow kos gawe jakarta wae
2    damage parah banget nder take all potong harga...
3    goodbener ajak warga dki jakarta gapai berkah ...
4    delapan dukung anies butuh jakarta perintah pusat
Name: tweet_join, dtype: object

# TF-IDF Scikit-Learn L2 Normalization

In [7]:
max_features = 200
tf_idf = TfidfVectorizer(max_features=max_features, binary=True)
tfidf_mat = tf_idf.fit_transform(df["tweet_join"]).toarray()

print("TF-IDF ", type(tfidf_mat), tfidf_mat.shape)

TF-IDF  <class 'numpy.ndarray'> (100, 200)


In [8]:
terms = tf_idf.get_feature_names()

# Pembobotan tfidf frekuensi setiap kata pada dokumen 
sums = tfidf_mat.sum(axis=0)

# Menghubungkan frekuensi kata pada dokumen
data = []
for col, term in enumerate(terms):
    data.append((term, sums[col] ))

ranking = pd.DataFrame(data, columns=['kata','rank'])
ranking = ranking.sort_values('rank', ascending=False)
ranking



Unnamed: 0,kata,rank
56,jakarta,17.212161
25,dki,3.208933
64,jual,3.112774
77,kerja,2.924317
4,anies,2.885955
...,...,...
125,personal,0.473568
178,testi,0.473568
144,rate,0.473568
146,rendah,0.473568


# TF-IDF Scikit-Learn L1 Normalization

In [9]:
max_features = 1000

# calc TF vector
cvect = CountVectorizer(max_features=max_features)
TF_vector = cvect.fit_transform(df["tweet_join"])

# normalize TF vector
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
tfidf = TfidfVectorizer(max_features=max_features, smooth_idf=False)
tfs = tfidf.fit_transform(df["tweet_join"])
IDF_vector = tfidf.idf_

# hitung TF x IDF sehingga dihasilkan TFIDF matrix / vector
tfidf_mat = normalized_TF_vector.multiply(IDF_vector).toarray()
tfidf_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# TF-IDF vector unigram only / bigram only / trigram only

In [11]:
max_features = 1000

def generate_tfidf_mat(min_gram, max_gram):
    cvect = CountVectorizer(max_features=max_features, ngram_range=(min_gram, max_gram))
    counts = cvect.fit_transform(df["tweet_join"])

    normalized_counts = normalize(counts, norm='l1', axis=1)

    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(min_gram, max_gram), smooth_idf=False)
    tfs = tfidf.fit_transform(df["tweet_join"])

    tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()
    
    TF = normalized_counts.toarray()
    IDF = tfidf.idf_
    TF_IDF = tfidf_mat
    return TF, IDF, TF_IDF, tfidf.get_feature_names()

# ngram_range (1, 1) to use unigram only
tf_mat_unigram, idf_mat_unigram, tfidf_mat_unigram, terms_unigram = generate_tfidf_mat(1,1)

# ngram_range (2, 2) to use bigram only
tf_mat_bigram, idf_mat_bigram, tfidf_mat_bigram, terms_bigram = generate_tfidf_mat(2,2)

# ngram_range (3, 3) to use trigram only
tf_mat_trigram, idf_mat_trigram, tfidf_mat_trigram, terms_trigram = generate_tfidf_mat(3,3)



In [12]:
#unigram
idx_sample = 1

print("Sampel TFIDF ke-" + str(idx_sample), "\n")
print(df["stemmed"][idx_sample], "\n")

print("\t\t\t", "TF", "\t\t", "IDF", "\t\t", "TF-IDF", "\t", "Term\n")
for i, item in enumerate(zip(tf_mat_unigram[idx_sample], idf_mat_unigram, tfidf_mat_unigram[idx_sample], terms_unigram)):
    if(item[2] != 0.0):
        print ("array position " + str(i) + "\t", 
               "%.6f" % item[0], "\t", 
               "%.6f" % item[1], "\t", 
               "%.6f" % item[2], "\t", 
               item[3])

Sampel TFIDF ke-1 

['akwowakoawkaow', 'kos', 'gawe', 'jakarta', 'wae'] 

			 TF 		 IDF 		 TF-IDF 	 Term

array position 14	 0.200000 	 5.605170 	 1.121034 	 akwowakoawkaow
array position 201	 0.200000 	 5.605170 	 1.121034 	 gawe
array position 262	 0.200000 	 1.301105 	 0.260221 	 jakarta
array position 330	 0.200000 	 5.605170 	 1.121034 	 kos
array position 669	 0.200000 	 5.605170 	 1.121034 	 wae


In [13]:
#bigram
idx_sample = 1

print("Sampel TFIDF ke-" + str(idx_sample), "\n")
print(df["stemmed"][idx_sample], "\n")

print("\t\t\t", "TF", "\t\t", "IDF", "\t\t", "TF-IDF", "\t", "Term\n")
for i, item in enumerate(zip(tf_mat_bigram[idx_sample], idf_mat_bigram, tfidf_mat_bigram[idx_sample], terms_bigram)):
    if(item[2] != 0.0):
        print ("array position " + str(i) + "\t", 
               "%.6f" % item[0], "\t", 
               "%.6f" % item[1], "\t", 
               "%.6f" % item[2], "\t", 
               item[3])

Sampel TFIDF ke-1 

['akwowakoawkaow', 'kos', 'gawe', 'jakarta', 'wae'] 

			 TF 		 IDF 		 TF-IDF 	 Term

array position 15	 0.250000 	 5.605170 	 1.401293 	 akwowakoawkaow kos
array position 232	 0.250000 	 5.605170 	 1.401293 	 gawe jakarta
array position 360	 0.250000 	 5.605170 	 1.401293 	 jakarta wae
array position 452	 0.250000 	 5.605170 	 1.401293 	 kos gawe


In [14]:
#trigram
idx_sample = 1

print("Sampel TFIDF ke-" + str(idx_sample), "\n")
print(df["stemmed"][idx_sample], "\n")

print("\t\t\t", "TF", "\t\t", "IDF", "\t\t", "TF-IDF", "\t", "Term\n")
for i, item in enumerate(zip(tf_mat_trigram[idx_sample], idf_mat_trigram, tfidf_mat_trigram[idx_sample], terms_bigram)):
    if(item[2] != 0.0):
        print ("array position " + str(i) + "\t", 
               "%.6f" % item[0], "\t", 
               "%.6f" % item[1], "\t", 
               "%.6f" % item[2], "\t", 
               item[3])

Sampel TFIDF ke-1 

['akwowakoawkaow', 'kos', 'gawe', 'jakarta', 'wae'] 

			 TF 		 IDF 		 TF-IDF 	 Term

array position 15	 0.333333 	 5.605170 	 1.868390 	 akwowakoawkaow kos
array position 223	 0.333333 	 5.605170 	 1.868390 	 freong keep
array position 439	 0.333333 	 5.605170 	 1.868390 	 kesiap helat


# Saving Pandas Dataframe to CSV File

In [15]:
df.to_csv('data_tfidf.csv')