# Import Data

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from google.colab import data_table
pd.set_option('display.max_columns', 30)

In [None]:
import seaborn as sns

# Read Data

In [None]:
# Clean the metadata workbook: it supplies the readable column names and the
# category encodings for the claims table loaded below.
metadata = pd.read_excel('/content/drive/MyDrive/satria_data_penyisihan_2/metadata.xlsx')
metadata = metadata.drop(0)  # discard the row labelled 0

# Column-name table: the first 21 remaining rows.
# .copy() makes this an independent frame, so the assignments below do not
# write into a slice of `metadata` (SettingWithCopyWarning / silent no-op).
column_features = metadata.iloc[:21, :].copy()
column_features.columns = ['Variable','Index','Value']

# Shorten three long descriptions so they can serve as column names.
before_values = ['Kode dan Nama diagnosis berdasarkan ICD-10 (3 digit)', 'Kode diagnosis berdasarkan ICD-10 (3 digit)',
                 'Kode diagnosis (3-5 digit)']
after_values = ['Kode Nama Diagnosis ICD 10','Kode Diagnosis ICD 10','Kode Diagnosis']
column_features['Value'] = column_features['Value'].replace(before_values, after_values)

# Category-encoding table: rows at positions 25..444. Blank cells are filled
# downwards from the last non-blank cell above them; .ffill() returns a new
# frame instead of filling a slice in place.
categorical_features = metadata.iloc[25:445, :].ffill()
categorical_features.columns = ['Variable','Encoding','Value']

In [None]:
train = pd.read_stata('/content/drive/MyDrive/satria_data_penyisihan_2/2019202004_nonkapitasi.dta')

# Show the distinct raw values of every variable listed in the encoding table,
# while the frame still carries its original column names.
for variable in categorical_features['Variable'].unique():
  print(train[variable].unique())
  print('')

# Swap the original column names for the readable ones from the metadata.
train.columns = column_features['Value'].iloc[:21].to_numpy()


In [None]:
# Summarise the columns and dtypes of the renamed frame.
train.info()

# Data Check

In [None]:
# Browse the frame in Colab's interactive table: 100 rows per page, up to 21 columns, index hidden.
data_table.DataTable(train, include_index=False, num_rows_per_page=100,max_columns=21)

## Missing values

In [None]:
# Percentage of missing values in each column.
train.isnull().sum().mul(100).div(train.shape[0])

## Proporsi tiap kategori

In [None]:
# Categorical-like columns: every 'category' column, plus every 'object'
# column except the visit identifier.
cols = [
    col
    for col in train.columns
    if (train[col].dtypes == 'object' and col != 'ID Kunjungan')
    or train[col].dtypes == 'category'
]

In [None]:
# Percentage share of every level, one categorical column at a time.
for col in cols:
  print(train[col].value_counts()*100/train.shape[0])
  print('')

In [None]:
# Inspect the facility-province column. The earlier key 'Provi' was a truncated
# column name and raised KeyError; 'Provinsi faskes' is the name used by the
# province merge further down.
train['Provinsi faskes']

### Kategori jenis faskes dan tipe faskes

#### Cek kategori laboratorium


In [None]:
# Facility kind/type pairs for rows whose 'Jenis faskes' is LABORATORIUM.
lab = train.loc[train['Jenis faskes'] == 'LABORATORIUM', ['Jenis faskes', 'Tipe faskes']]

In [None]:
# Distinct 'Tipe faskes' values within the current `lab` subset.
lab['Tipe faskes'].unique()

#### Cek dokter umum

In [None]:
# Facility kind/type pairs for rows whose 'Jenis faskes' is DOKTER UMUM
# (the name `lab` is reused for every facility kind in this section).
lab = train.loc[train['Jenis faskes'] == 'DOKTER UMUM', ['Jenis faskes', 'Tipe faskes']]

In [None]:
# Distinct 'Tipe faskes' values within the current `lab` subset.
lab['Tipe faskes'].unique()

#### Cek jejaring

In [None]:
# Facility kind/type pairs for rows whose 'Jenis faskes' is JEJARING
# (the name `lab` is reused for every facility kind in this section).
lab = train.loc[train['Jenis faskes'] == 'JEJARING', ['Jenis faskes', 'Tipe faskes']]

In [None]:
# Distinct 'Tipe faskes' values within the current `lab` subset.
lab['Tipe faskes'].unique()

#### Cek puskesmas

In [None]:
# Facility kind/type pairs for rows whose 'Jenis faskes' is PUSKESMAS
# (the name `lab` is reused for every facility kind in this section).
lab = train.loc[train['Jenis faskes'] == 'PUSKESMAS', ['Jenis faskes', 'Tipe faskes']]

In [None]:
# Distinct 'Tipe faskes' values within the current `lab` subset.
lab['Tipe faskes'].unique()

#### Cek klinik pratama

In [None]:
# Facility kind/type pairs for rows whose 'Jenis faskes' is KLINIK PRATAMA
# (the name `lab` is reused for every facility kind in this section).
lab = train.loc[train['Jenis faskes'] == 'KLINIK PRATAMA', ['Jenis faskes', 'Tipe faskes']]

In [None]:
# Distinct 'Tipe faskes' values within the current `lab` subset.
lab['Tipe faskes'].unique()

Kesimpulan 

- laboratorium --> laboratorium
- dokter umum -->  dokter praktek perorangan
- jejaring -->  ppk lain lain
- puskesmas --> rawat inap dan non rawat inap
- klinik pratama --> klinik rawat inap, klinik non rawat inap, rs kelas d pratama

## Data duplikat

In [None]:
# Rows that repeat an earlier row on every column.
train[train.duplicated()]

In [None]:
# Ignore the three identifier columns, then report the shape of the rows that
# repeat an earlier row on all remaining columns.
id_columns = ['Nomor Peserta','Nomor keluarga','ID Kunjungan']
train_drop = train.drop(columns=id_columns)
train_drop.loc[train_drop.duplicated()].shape

In [None]:
# Inspect the rows whose 'Tipe faskes' is 'NON RAWAT INAP'.
train[train['Tipe faskes'] == 'NON RAWAT INAP']

# Exploratory Data Analysis

## Biaya Tagihan

In [None]:
# Distribution of the billed amount, scaled to millions.
# The data is passed as `x=` explicitly: a positional argument is interpreted
# differently across seaborn releases (x in older ones, `data` in newer ones),
# which changes the orientation of the plot.
ax = sns.boxplot(x=train['Biaya tagih'] / 10**6)
ax.set_xlabel('Biaya tagih (juta)');

In [None]:
# Mean of 'Biaya tagih' in its original (unscaled) units.
train['Biaya tagih'].mean()

## Demografi Pasien

In [None]:
# geopandas is imported in the next cell for the province maps (version not pinned).
!pip install geopandas

In [None]:
import matplotlib.pyplot as plt
import geopandas as gpd

In [None]:
# Fetch the repository that contains the Indonesia province GeoJSON read in the next cell.
!git clone https://github.com/superpikar/indonesia-geojson.git

In [None]:
# Load the province polygons, upper-case their names and rename four provinces
# so that they can be joined against 'Provinsi faskes'.
path = '/content/indonesia-geojson/indonesia-edit.geojson'
df_geo = gpd.read_file(path)

nama_lama = ['BANGKA-BELITUNG','YOGYAKARTA','JAKARTA RAYA','IRIAN JAYA BARAT']
nama_baru = ['KEPULAUAN BANGKA BELITUNG','DAERAH ISTIMEWA YOGYAKARTA','DKI JAKARTA','PAPUA BARAT']
df_geo['state'] = df_geo['state'].str.upper().replace(nama_lama, nama_baru)

display(df_geo)

In [None]:
# Lookup table: province name -> geometry.
dict_geom = {
    state: geometry
    for state, geometry in zip(df_geo['state'], df_geo['geometry'])
}
dict_geom

In [None]:
# Inner Join dengan data train
geo_data = df_geo.merge(train, how = 'inner', left_on = 'state', right_on = "Provinsi faskes")
geo_data

In [None]:
# Number of non-null 'Nomor Peserta' entries per province, with the province
# geometry re-attached from `dict_geom`; the columns are then relabelled.
from geopandas import GeoDataFrame
count_pop = GeoDataFrame(geo_data.groupby(['state'])['Nomor Peserta'].count().reset_index())
count_pop['geometry'] = count_pop['state'].map(dict_geom)
count_pop.columns = ['Provinsi','Jumlah Pasien','geometry']

# Mean billed amount per province, built the same way.
biaya_pop = GeoDataFrame(geo_data.groupby(['state'])['Biaya tagih'].mean().reset_index())
biaya_pop['geometry'] = biaya_pop['state'].map(dict_geom)
biaya_pop.columns = ['Provinsi','Biaya tagih','geometry']

In [None]:
# Provinces ordered from the highest to the lowest count.
count_pop.sort_values('Jumlah Pasien',ascending=False)

In [None]:
# Provinces ordered from the highest to the lowest mean billed amount.
biaya_pop.sort_values('Biaya tagih',ascending=False)

In [None]:
# Choropleth of the per-province count held in `count_pop`.

# column of `count_pop` that drives the colour scale
values = 'Jumlah Pasien'

# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(25,10), facecolor='lightblue')

# remove the axis
ax.axis('off')

# add a title (a fixed string: it has no placeholder, so nothing is formatted into it)
title = 'Jumlah Peserta BPJS Per Provinsi'
ax.set_title(title, fontdict={'fontsize': '25', 'fontweight' : '3'})

# create an annotation for the data source
ax.annotate('Source: BPJS Kesehatan',xy=(0.1, .08),  xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=12 ,color='#555555')

# create map; the trailing semicolon suppresses the Axes repr in the cell output
count_pop.plot(column=values, cmap='viridis', linewidth=0.8, ax=ax, edgecolor='0.8', legend = True);

In [None]:
# Choropleth of the per-province mean billed amount held in `biaya_pop`.

# column of `biaya_pop` that drives the colour scale
values = 'Biaya tagih'

# create figure and axes for Matplotlib
fig, ax = plt.subplots(1, figsize=(25, 10), facecolor='lightblue')

# remove the axis
ax.axis('off')

# add a title (a fixed string: it has no placeholder, so nothing is formatted into it)
title = 'Rerata Biaya Tagih Peserta BPJS per Provinsi'
ax.set_title(title, fontdict={'fontsize': '25', 'fontweight' : '3'})

# create an annotation for the data source
ax.annotate('Source: BPJS Kesehatan',xy=(0.1, .08),  xycoords='figure fraction', horizontalalignment='left', verticalalignment='top', fontsize=16 ,color='#555555')

# create map; the trailing semicolon suppresses the Axes repr in the cell output
biaya_pop.plot(column=values, cmap='viridis', linewidth=0.8, ax=ax, edgecolor='0.8', legend = True);

## Lama Tanggal Kunjungan dan Tanggal tindakan

In [None]:
# Number of rows per visit year.
train['Tanggal kunjungan'].dt.year.value_counts()  # original note: the dataset covers 2018, 2019 and 2020

In [None]:
# Parse the three date columns, then store the number of whole days between
# the visit date and the procedure date.
for date_col in ['Tanggal kunjungan', 'Tanggal tindakan', 'Tanggal pulang']:
  train[date_col] = pd.to_datetime(train[date_col])

train['Selisih Waktu Pelayanan'] = (train['Tanggal tindakan'] - train['Tanggal kunjungan']).dt.days

In [None]:
# Distinct facility types present in the data.
train['Tipe faskes'].unique()

In [None]:
# Facilities of the two non-inpatient types, then the subset of those rows
# whose procedure date falls after the visit date.
tipe_non_inap = ['NON RAWAT INAP','KLINIK NON RAWAT INAP']
non_rawat_inap = train.loc[train['Tipe faskes'].isin(tipe_non_inap)]
non_rawat_inap.loc[non_rawat_inap['Selisih Waktu Pelayanan'] > 0]

In [None]:
# Parse the same three date columns on the province-joined frame.
for date_col in ['Tanggal kunjungan', 'Tanggal tindakan', 'Tanggal pulang']:
  geo_data[date_col] = pd.to_datetime(geo_data[date_col])

In [None]:
# Whole days between the visit date and the procedure date, per joined row.
geo_data['Selisih Waktu Pelayanan'] = (geo_data['Tanggal tindakan'] - geo_data['Tanggal kunjungan']).dt.days

In [None]:
# Mean visit-to-procedure gap (days) per province, with the province geometry
# re-attached from `dict_geom` and the columns relabelled.
groupby_selisih_waktu = GeoDataFrame(geo_data.groupby(['state'])['Selisih Waktu Pelayanan'].mean().reset_index())
groupby_selisih_waktu['geometry'] = groupby_selisih_waktu['state'].map(dict_geom)
groupby_selisih_waktu.columns = ['Provinsi','Selang Waktu Kunjungan dan Tindakan','geometry']
groupby_selisih_waktu

## Distribusi Plot

In [None]:
# Mean billed amount per facility kind, smallest first, as horizontal bars.
fig, ax = plt.subplots(figsize=(15, 10))
train.groupby(['Jenis faskes'])['Biaya tagih'].mean().sort_values().plot(kind='barh', ax=ax)
ax.set_title("Rerata Biaya Tagih Peserta Berdasarkan Kategori Jenis Faskes")
ax.set_xlabel('Rerata Biaya Tagih Peserta BPJS (Rupiah)')
ax.set_ylabel('Jenis Faskes')
plt.show()

In [None]:
# Mean billed amount per participant segment, smallest first, as horizontal bars.
fig, ax = plt.subplots(figsize=(15, 10))
train.groupby(['Segmen peserta'])['Biaya tagih'].mean().sort_values().plot(kind='barh', ax=ax)
ax.set_title("Rerata Biaya Tagih Peserta Berdasarkan Kategori Segmen Peserta")
ax.set_xlabel('Rerata Biaya Tagih Peserta BPJS (Rupiah)')
ax.set_ylabel('Segmen Peserta')
plt.show()

In [None]:
# Mean billed amount per facility ownership class, smallest first, as horizontal bars.
fig, ax = plt.subplots(figsize=(15, 10))
train.groupby(['Kepemilikan faskes'])['Biaya tagih'].mean().sort_values().plot(kind='barh', ax=ax)
ax.set_title("Rerata Biaya Tagih Peserta Berdasarkan Kategori Kepemilikan Faskes")
ax.set_xlabel('Rerata Biaya Tagih Peserta BPJS (Rupiah)')
ax.set_ylabel('Kepemilikan Faskes')
plt.show()

# Sentiment Analysis

## Import Data
* Twitter
* Berita

In [None]:
# Install snscrape twice: first from the GitHub repository, then from PyPI.
# NOTE(review): presumably only one of the two builds is intended — confirm which, and pin it.
!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
!pip install snscrape

In [None]:
# Repeats the PyPI install from the previous cell.
!pip install snscrape

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# Search settings for the scraper: the query string (keyword 'bpjs', lang:id,
# since/until date bounds), the list that collects rows, and the row cap.
query = 'bpjs min_replies:0 min_faves:0 lang:id until:2022-11-02 since:2021-01-01'
tweets = []
limit = 100

def scrap(tweets,limit,qeuery):
  """Collect tweets matching a search query into a DataFrame.

  Args:
    tweets: list that receives one ``[date, username, content]`` row per
      tweet. It is appended to in place, so rows already in the list count
      toward ``limit``.
    limit: stop as soon as ``tweets`` holds at least this many rows.
    qeuery: Twitter search query string. The spelling of the parameter name
      is kept so that existing keyword callers keep working.

  Returns:
    DataFrame with the columns ``Date``, ``User`` and ``Tweet`` built from
    ``tweets``.
  """
  # Search with the argument that was passed in, not with whatever the
  # notebook-level `query` variable happens to hold.
  for tweet in sntwitter.TwitterSearchScraper(qeuery).get_items():
    # `>=` (not `==`) so a list that already exceeds the limit stops at once
    # instead of scraping until the results are exhausted.
    if len(tweets) >= limit:
      break
    tweets.append([tweet.date, tweet.username, tweet.content])

  return pd.DataFrame(tweets, columns=['Date', 'User', 'Tweet'])

In [None]:
# Re-declares the same search settings as the previous cell; `tweets` is reset to an empty list.
query = 'bpjs min_replies:0 min_faves:0 lang:id until:2022-11-02 since:2021-01-01'
tweets = []
limit = 100

## twitter impor data by province

In [None]:
import itertools

In [None]:
# City table reduced to one row per city, keeping only cities whose 'capital'
# value is 'admin' or 'primary', with province / capital columns renamed.
indonesia = (
    pd.read_csv('/content/drive/MyDrive/satria_data_penyisihan_2/id.csv')
    .drop(columns=['iso2'])
    .drop_duplicates(subset='city')
    .loc[lambda kota: kota['capital'].isin(['admin','primary'])]
    .rename(columns={'admin_name':'provinsi', 'city':'ibu_kota'})
    .reset_index(drop=True)
)

In [None]:
# Province area table: drop incomplete rows and give the columns short names.
luas = pd.read_excel('/content/drive/MyDrive/satria_data_penyisihan_2/luas_provinsi.xlsx').dropna()
luas.columns = ['provinsi','ibu_kota','luas','persentase','jumlah_pulau']

# Shorten two province names. The result is assigned back to the column:
# calling replace(..., inplace=True) on `luas['provinsi']` acts on a temporary
# selection and does not reliably update `luas` (it is a no-op under pandas
# Copy-on-Write).
luas['provinsi'] = luas['provinsi'].replace({'DKI Jakarta': 'Jakarta',
                                             'DI Yogyakarta': 'Yogyakarta'})

In [None]:
# Join capitals with province areas, turn the area text into a float
# (spaces stripped, decimal comma -> point) and derive the radius of a circle
# with the same area.
in_lus = indonesia.merge(luas, on='provinsi')[['provinsi','luas','lat','lng','ibu_kota_x']]
in_lus['luas'] = (
    in_lus['luas']
    .str.replace(' ','')
    .str.replace(',','.')
    .astype('float')
)
in_lus['radius'] = np.sqrt(in_lus['luas'] / np.pi)

In [None]:
# Load the province table from a saved workbook; this rebinds `in_lus`,
# replacing the frame computed in the previous cell.
in_lus=pd.read_excel('/content/drive/MyDrive/satria_data_penyisihan_2/inlus.xlsx')

In [None]:
# Per-province lookup dictionaries, all keyed by the 'provinsi' column.
# The 'pulau' column is not produced by the merge cell above, so it has to
# come from the workbook version of `in_lus`.
koprov=dict(zip(in_lus['provinsi'].array,in_lus['ibu_kota_x'].array)) # capital city
kolat=dict(zip(in_lus['provinsi'].array,in_lus['lat'].array)) # latitude
kolong=dict(zip(in_lus['provinsi'].array,in_lus['lng'].array)) # longitude
kora=dict(zip(in_lus['provinsi'].array,in_lus['radius'].array)) # radius
kolau=dict(zip(in_lus['provinsi'].array,in_lus['pulau'].array)) # island group

In [None]:
# Scrape 'bpjs' tweets posted near every province's coordinates. The search
# radius depends on the island group recorded in `kolau`.
RADIUS_KM_BY_ISLAND = {
    'jawa': 10,
    'sumatera': 15,
    'kalimantan': 50,
    'sulawesi': 50,
    'nusa tenggara': 100,
    'papua': 100,
    'maluku': 100,
}
DEFAULT_RADIUS_KM = 10          # island group not listed above
MAX_TWEETS_PER_PROVINCE = 1000  # cap on results taken per province

# Tuning note carried over from the original (translated; the source sentence is
# unfinished): "800 km, 1000 rows --> if there are no duplicate rows at this size,
# there is not enough data; if there are many duplicates, then ..."
frames = []   # one frame per province that returned tweets; concatenated once below
empty_c = []  # provinces whose search returned nothing
a = 1         # running count of provinces that returned tweets
for i in kolat.keys():
  radius_km = RADIUS_KM_BY_ISLAND.get(kolau[i], DEFAULT_RADIUS_KM)
  loc = str(kolat[i]) + ', ' + str(kolong[i]) + ', ' + str(radius_km) + 'km'
  search = 'bpjs until:2022-11-05 since:2021-01-01 geocode:"{}"'.format(loc)
  df_twit = pd.DataFrame(itertools.islice(sntwitter.TwitterSearchScraper(search).get_items(),
                                          MAX_TWEETS_PER_PROVINCE))
  if df_twit.empty:
    # nothing found: remember the province and move on to the next one
    empty_c.append(i)
    continue
  # .copy() so that adding the province column does not write into a slice
  df_twit = df_twit[['username', 'date', 'content']].copy()
  df_twit['provinsi'] = i
  frames.append(df_twit)
  print(a)
  a += 1

# A single concat instead of growing `df` inside the loop (which re-copies all
# accumulated rows on every iteration). `df` is an empty frame when no
# province returned any tweet.
df = pd.concat(frames) if frames else pd.DataFrame()


## Analisis Data Twitter

In [None]:
# Load the saved tweets and discard the index column written by an earlier export.
tweet = (
    pd.read_csv('/content/drive/MyDrive/satria_data_penyisihan_2/twitter.csv')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Remove the account names. Rebinding with errors='ignore' keeps the cell
# re-runnable: the in-place drop raised KeyError on a second run because the
# column was already gone.
tweet = tweet.drop(columns='User', errors='ignore')

In [None]:
# Display the tweet frame.
tweet

Info kehilangan, dompet berisi uang, ktp a.n oky dwi prastyo, stnk, atm bca, bpjs, sim c 
Hilang disekitaran baleharjo wonosari
@KabarGunkid https://t.co/l9uc3XIA3x

Yang perlu dihilangin:
- Tag orangnya
- Hashtag
- Link
- \n

In [None]:
import seaborn as sns  
import matplotlib.pyplot as plt

In [None]:
# Visualisasi jumlah kata pada tweet
def word_count(sentence):
  """Return the number of whitespace-separated tokens in `sentence`."""
  tokens = sentence.split()
  return len(tokens)

# Word count of every raw tweet and its distribution.
tweet['count_words'] = tweet['Tweet'].apply(word_count)

# histplot(kde=True, stat='density') draws the histogram-plus-density figure
# that the deprecated sns.distplot used to draw.
sns.histplot(x=tweet['count_words'], kde=True, stat='density')
plt.title('Distribusi Jumlah Kata pada Data Twitter')
plt.xlabel('Jumlah Kata')
plt.show()

In [None]:
# Parse the timestamp, then derive a calendar-date column and a month-number column.
tweet['Date'] = pd.to_datetime(tweet['Date'])
waktu = tweet['Date'].dt
tweet['Date-Only'] = waktu.date
tweet['Month'] = waktu.month

In [None]:
# Number of tweets per calendar date.
fig, ax = plt.subplots(figsize=(15, 10))
tweet.groupby('Date-Only')['Tweet'].count().plot(ax=ax)
ax.set_title('Jumlah Post Tweet Per Tanggal')
plt.show()

In [None]:
# Number of tweets per month ('M' resampling bins).
monthly_posts = tweet.set_index('Date')['Tweet'].resample('M').count()
ax = monthly_posts.plot()
ax.set_xlabel('Bulan')
ax.set_ylabel('Jumlah Post')
ax.set_title('Jumlah Post Tweet BPJS per Bulan')
plt.show()

### Text Cleaning

In [None]:
import re
# Preview what the newline / "a.n." pattern finds in the first tweet.
# The dots in the pattern are unescaped, so each one matches any character.
print(re.findall('(\n|a.n.)', tweet['Tweet'][0]))
# Links

In [None]:
def bersih_bersih(sentence):
  """Normalise one tweet for text analysis.

  The text is lower-cased; URLs, @mentions, #hashtags and the literal
  abbreviation "a.n" / "a.n." are removed; newlines become spaces and runs
  of whitespace are collapsed to a single space.

  Args:
    sentence: raw tweet text (str).

  Returns:
    The cleaned text, without leading or trailing whitespace.
  """
  sentence = sentence.lower()
  # URLs go first so that a '#fragment' or '@' inside a link is removed with it.
  sentence = re.sub(r'https?://\S*|http\.\S+', '', sentence)
  sentence = re.sub(r'@\w+', '', sentence)
  sentence = re.sub(r'#\w+', '', sentence)
  # The dots are escaped: unescaped, "a.n." also matched ordinary text such as
  # "ain " in "main " and deleted it.
  sentence = re.sub(r'\ba\.n\.?', ' ', sentence)
  # split()/join turns newlines into spaces (instead of gluing the words on
  # both sides together), collapses whitespace runs and trims the ends.
  return ' '.join(sentence.split())

In [None]:
# Apply the cleaning function to every raw tweet.
text_cleaned = tweet['Tweet'].apply(lambda sentence: bersih_bersih(sentence))

In [None]:
# Store the cleaned text alongside the raw tweet.
tweet['Tweet Cleaned'] = text_cleaned

In [None]:
# Word count of every cleaned tweet and its distribution.
tweet['count_words_cleaned'] = tweet['Tweet Cleaned'].apply(word_count)

# histplot(kde=True, stat='density') draws the histogram-plus-density figure
# that the deprecated sns.distplot used to draw.
sns.histplot(x=tweet['count_words_cleaned'], kde=True, stat='density')
plt.title('Distribusi Jumlah Kata pada Data Twitter Setelah Disederhanakan')
plt.xlabel('Jumlah Kata')
plt.show()

### Hilangin Stopwords

In [None]:
# Provides the Sastrawi package imported in the next cell (version not pinned).
!pip install PySastrawi

In [None]:
import spacy
from spacy.lang.id.stop_words import STOP_WORDS
nlp = spacy.blank('id')
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [None]:
# Add extra short/informal tokens to spaCy's Indonesian stop-word set.
# This mutates the imported set in place; repeated entries such as 'gt' are harmless in a set.
STOP_WORDS.update(['yg','jg','teh','mah','da','atuh','jd','km','ak','lg','ya','ga','ngga','nggak','gak','tp',
                   'kalo','nya','pake','liat','udh','aja','wkwk','wkwkwk','wk','gt','gais','blm','sih','tau',
                   'tahu','gt','udah','utk','rb','rp','dgn','ayo','isi','biar','yah','dr','bawa','gitu','eh',
                   'pas','td','sm','pengen','pgn','dpt','sd','byr','min','dscn','sy','no','sok'])

In [None]:
def remove_stopwords(sentence):
  """Drop stop words and non-alphabetic tokens from `sentence`.

  The text is split on whitespace. A token is kept only when it is absent
  from `STOP_WORDS` and consists solely of letters, so tokens containing
  digits or attached punctuation (e.g. "bagus,") are removed as well.
  Returns the kept tokens joined by single spaces.
  """
  kept = []
  for token in sentence.split():
    if token in STOP_WORDS:
      continue
    if not token.isalpha():
      continue
    kept.append(token)
  return " ".join(kept)

In [None]:
# Overwrite the cleaned text with its stop-word-free version.
tweet['Tweet Cleaned'] = tweet['Tweet Cleaned'].apply(remove_stopwords)

In [None]:
# Spot-check the cleaned text at index label 9.
tweet['Tweet Cleaned'][9]

In [None]:
# Spot-check the raw tweet at index label 4.
tweet['Tweet'][4]

## Sentiment Classifier with Roberta Base
https://huggingface.co/w11wo/indonesian-roberta-base-indolem-sentiment-classifier-fold-0?text=Pelayanan+hotel+ini+sangat+baik.

In [None]:
# Hugging Face transformers supplies `pipeline`, imported in the next cell (version not pinned).
!pip install transformers

In [None]:
from transformers import pipeline
pretrained_name = "w11wo/indonesian-roberta-base-indolem-sentiment-classifier-fold-0"

# Build a sentiment-analysis pipeline from the checkpoint named above.
# This rebinds `nlp`, which earlier held the blank spaCy pipeline.
nlp = pipeline(
    "sentiment-analysis",
    model=pretrained_name,
    tokenizer=pretrained_name)

# Smoke test on one sample sentence.
nlp("Pelayanan hotel ini sangat baik.")

In [None]:
# Label every raw tweet with the pipeline currently bound to `nlp`.
# The `df` produced by the per-province scrape has the columns username / date /
# content / provinsi and no 'Tweet' column, so `df['Tweet']` raised KeyError on a
# fresh run. `df` is therefore rebuilt from the loaded tweet table.
# NOTE(review): confirm that the raw 'Tweet' text (rather than 'Tweet Cleaned') is the intended input.
df = tweet[['Tweet']].copy()
df['Label'] = df['Tweet'].apply(lambda value: nlp(value)[0]['label'])

In [None]:
# Rows labelled 'LABEL_1', re-indexed from 0; show the sixth such tweet.
label_1 = df[df['Label'] == 'LABEL_1'].reset_index(drop = True)
label_1['Tweet'][5]

## Sentiment Classifier with BERT Base

In [None]:
# Repeats the transformers install from the RoBERTa section.
!pip install transformers

In [None]:
from transformers import pipeline

# Rebind `nlp` to a sentiment-analysis pipeline built from the BERT checkpoint below;
# later cells that call `nlp` use this model.
pretrained_name = "ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa"
nlp = pipeline("sentiment-analysis", model=pretrained_name, tokenizer=pretrained_name)

In [None]:
# Label every cleaned tweet with the top prediction of the current `nlp` pipeline.
tweet['Label'] = tweet['Tweet Cleaned'].apply(lambda value: nlp(value)[0]['label'])

In [None]:
# Number of tweets per predicted label.
tweet['Label'].value_counts()

In [None]:
# Persist the labelled tweets to Drive; the WordCloud section reloads this file.
tweet.to_excel('/content/drive/MyDrive/satria_data_penyisihan_2/tweet_labeling.xlsx', index = False)

### Result

## WordCloud

In [None]:
# Repeats the PySastrawi install from the stop-word section.
!pip install PySastrawi

In [None]:
from collections import Counter
from nltk import ngrams
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import ngrams
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from spacy.lang.id.stop_words import STOP_WORDS
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
def word_frequency(sentence, grams):
  """Count n-grams over a collection of texts.

  Args:
    sentence: iterable of strings; they are joined with spaces and tokenised
      as a single text.
    grams: n-gram size passed to `ngrams`.

  Returns:
    DataFrame with the columns 'Kata' (the n-gram tuple) and 'Frekuensi'
    (its count), sorted by 'Frekuensi' in descending order.
  """
  corpus = " ".join(sentence)
  gram_counts = Counter(ngrams(word_tokenize(corpus), grams))
  table = pd.DataFrame(gram_counts.items(), columns=['Kata', 'Frekuensi'])
  return table.sort_values(by='Frekuensi', ascending=False)

In [None]:
from PIL import Image
# Image read as an array and used as the word-cloud mask.
# NOTE(review): '/content/twitter.png' is presumably uploaded by hand to the Colab runtime — confirm.
mask_tweet = np.array(Image.open('/content/twitter.png'))

def wordcloud(sentence, title):
  """Draw a word cloud, shaped by `mask_tweet`, from a collection of texts.

  Args:
    sentence: iterable of strings; they are joined with spaces into one text.
    title: title shown above the figure.

  Returns:
    None. The figure is shown as a side effect. When the joined text is
    blank, a message is printed and nothing is drawn.
  """
  text = " ".join(sentence)
  if not text.strip():
    # WordCloud.generate raises ValueError when it is given no words, which
    # would abort the notebook for an empty selection.
    print('Tidak ada kata untuk word cloud: {}'.format(title))
    return
  WC = WordCloud(background_color = 'white', max_words=2000, contour_width=3, contour_color='steelblue', mask = mask_tweet)
  cloud = WC.generate(text)
  plt.figure(figsize=(15,10))
  plt.imshow(cloud, interpolation = "bilinear")
  plt.title(title)
  plt.axis("off")
  plt.show()

In [None]:
# Reload the labelled tweets from the workbook written in the BERT section.
tweet = pd.read_excel('/content/drive/MyDrive/satria_data_penyisihan_2/tweet_labeling.xlsx')

In [None]:
# Display the reloaded frame.
tweet

In [None]:
# Write a copy of the frame to 'tweet.xlsx' in the current working directory.
tweet.to_excel('tweet.xlsx', index = False)

In [None]:
# Number of tweets per sentiment label.
# The column is passed as `x=` explicitly: a positional argument is treated as
# `x` by older seaborn releases and as `data` by newer ones.
sns.countplot(x=tweet['Label'])
plt.xlabel('Polaritas')
plt.ylabel('Jumlah Post')
plt.title('Persebaran Sentimen dari Post Tweet BPJS')
plt.show()

## Further Cleaning

In [None]:
def remove_nonalphanumeric(sentence):
  """Keep only ASCII letters and digits, then drop stop words.

  Every character outside a-z, A-Z and 0-9 is replaced by a space, the
  result is split on whitespace, and tokens found in `STOP_WORDS` are
  discarded. Returns the remaining tokens joined by single spaces.
  """
  spaced = re.sub('[^a-zA-Z0-9]', " ", sentence)
  kept = filter(lambda token: token.isalnum() and token not in STOP_WORDS,
                spaced.split())
  return " ".join(kept)

In [None]:
import string
# Replace missing cleaned text with an empty string so the string functions below can be applied to every row.
# NOTE(review): `string` is not used in the visible cells.
tweet['Tweet Cleaned'] = tweet['Tweet Cleaned'].fillna('')

In [None]:
import string
# Strip non-alphanumeric characters (and stop words) from every cleaned tweet.
tweet['Tweet Cleaned'] = tweet['Tweet Cleaned'].apply(lambda teks: remove_nonalphanumeric(teks))

In [None]:
def remove_stopwords(sentence):
  """Drop stop words and non-alphabetic tokens from `sentence`.

  Splits on whitespace and keeps a token only when it is absent from
  `STOP_WORDS` and consists solely of letters; returns the kept tokens
  joined by single spaces. This repeats, line for line, the definition
  given in the stop-word section, so the notebook defines the function twice.
  """
  words = sentence.split()
  words = [word for word in words if word not in STOP_WORDS and word.isalpha()]
  return " ".join(words)

In [None]:
# Add a longer list of short/informal tokens to the stop-word set (mutates the imported set in place;
# repeated entries such as 'gt' and 'krn' are harmless in a set).
STOP_WORDS.update(['yg','jg','teh','mah','da','atuh','jd','km','ak','lg','ya','ga','ngga','nggak','gak','tp',
                   'kalo','nya','pake','liat','udh','aja','wkwk','wkwkwk','wk','gt','gais','blm','sih','tau',
                   'tahu','gt','udah','utk','rb','rp','dgn','ayo','isi','biar','yah','dr','bawa','gitu','eh',
                   'pas','td','sm','pengen','pgn','dpt','sd','byr','min','dscn','sy','no','gw','bgt','lu','tdk','dll','dg',
                   'org','skrg','krn','amp','klo','krn'])

In [None]:
# Re-run stop-word removal now that the stop-word set has been extended.
tweet['Tweet Cleaned'] = tweet['Tweet Cleaned'].apply(lambda teks: remove_stopwords(teks))

In [None]:
# Build a stemmer from Sastrawi's StemmerFactory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def lemmatizer(sentence):
  """Return `sentence` as processed by `stemmer.stem` (stemming, despite the name)."""
  return stemmer.stem(sentence)

# Store the stemmed version of every cleaned tweet in a new column.
tweet['Tweet Cleaned Lemmatized'] = tweet['Tweet Cleaned'].apply(lemmatizer)

## Adjective Word Detection

In [None]:
# Install or upgrade flair, used for the part-of-speech tagger below (version not pinned).
!pip install --upgrade flair

In [None]:
import flair.datasets
# Load flair's UD_INDONESIAN dataset as the training corpus for the tagger.
corpus = flair.datasets.UD_INDONESIAN()

In [None]:
# Label type to predict and the label dictionary the corpus provides for it.
tag_type = 'upos'
upos_dictionary = corpus.make_label_dictionary(label_type=tag_type)

In [None]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, BertEmbeddings
from typing import List
# Stack the two word embeddings 'id-crawl' and 'id' into a single embedding.
# NOTE(review): TokenEmbeddings, BertEmbeddings and List are imported but not used in the visible cells.
embedding_types = [WordEmbeddings('id-crawl'),WordEmbeddings('id')]
embeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
from flair.models import SequenceTagger
# Sequence tagger over the stacked embeddings, predicting `tag_type` labels
# from `upos_dictionary`, with hidden size 256 and the CRF option enabled.
tagger = SequenceTagger(hidden_size=256,embeddings=embeddings,
                                      tag_dictionary=upos_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

In [None]:
from flair.trainers import ModelTrainer
# Train the tagger on the corpus for at most 10 epochs; output is written
# under 'resources/taggers/example-universal-pos'.
trainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-universal-pos',
 learning_rate=0.1,
 mini_batch_size=32,
 max_epochs=10)

In [None]:
# Archive /content/resources (the training output) to Drive.
!zip -r /content/drive/MyDrive/satria_data_penyisihan_2/pretrained.zip /content/resources

In [None]:
# Restore the archived model from Drive into the current working directory.
!unzip /content/drive/MyDrive/satria_data_penyisihan_2/pretrained.zip

In [None]:
from flair.data import Sentence
# Smoke test: load the best model from the training output directory, tag one sample
# sentence and print the tagged string.
sentence = Sentence('saya dan dia kemarin pegi ke pasar bersama untuk membeli jeru')
tag_pos = SequenceTagger.load('resources/taggers/example-universal-pos/best-model.pt')
tag_pos.predict(sentence)
print(sentence.to_tagged_string())

In [None]:
# Reload the tagger from a path relative to the working directory.
# NOTE(review): 'content/resources/...' is presumably where the unzip cell extracts the archive
# when run from /content — confirm; the previous cell loads from 'resources/...' instead.
tag_pos = SequenceTagger.load('content/resources/taggers/example-universal-pos/best-model.pt')

In [None]:
def find_adjective(sentence):
  """Return the words of `sentence` that the tagger marks as ADJ.

  The text is wrapped in a `Sentence`, tagged with `tag_pos`, and every
  `"word"/ADJ` occurrence in the tagged string is collected. The words are
  returned in order of appearance, joined by single spaces (an empty string
  when none is found).
  """
  tagged = Sentence(sentence)
  tag_pos.predict(tagged)
  adjectives = []
  for match in re.findall(r'"\w+"/ADJ', tagged.to_tagged_string()):
    # strip the tag suffix and the surrounding quotes
    adjectives.append(match.replace('/ADJ', '').replace('"', ''))
  return " ".join(adjectives)

In [None]:
# Extract the adjectives of every stemmed tweet into a new column.
tweet['Adjective Words'] = tweet['Tweet Cleaned Lemmatized'].apply(lambda text: find_adjective(text))

In [None]:
# Trim leading and trailing whitespace from the adjective strings.
tweet['Adjective Words'] = tweet['Adjective Words'].str.strip()

## Data Splitting

In [None]:
# One re-indexed frame per sentiment label.
neutral, positive, negative = (
    tweet.loc[tweet['Label'] == label].reset_index(drop=True)
    for label in ('Neutral', 'Positive', 'Negative')
)

#### Filter kata adjektif sesuai sentimen polaritasnya

##### Negatif

In [None]:
# Frequency table of the adjectives found in negative tweets.
all_sentence = " ".join(negative['Adjective Words'])
dict_words = dict(Counter(word_tokenize(all_sentence)))
kata, jumlah = dict_words.keys(), dict_words.values()
dict_df = {'Kata': kata, 'Jumlah': jumlah}
negative_df = pd.DataFrame(dict_df)

# Label each adjective on its own with the pipeline currently bound to `nlp`
# (the BERT Base model, per the original note).
negative_df['Label'] = negative_df['Kata'].map(lambda word: nlp(word)[0]['label'])

In [None]:
# Number of adjectives per predicted label.
negative_df['Label'].value_counts()

In [None]:
# Word cloud of the adjectives from negative tweets that are themselves labelled 'Negative'.
negative_labeled = negative_df[negative_df['Label'] == 'Negative']
wordcloud(negative_labeled['Kata'], "Word Cloud Untuk Sentimen Negatif")

In [None]:
# Word cloud of the adjectives from negative tweets that are labelled 'Positive'.
neg_pos_labeled = negative_df[negative_df['Label'] == 'Positive']
wordcloud(neg_pos_labeled['Kata'], "Word Cloud Untuk Kata-Kata Positif pada Sentimen Negatif")

##### Positive

In [None]:
# Frequency table of the adjectives found in positive tweets.
all_sentence = " ".join(positive['Adjective Words'])
dict_words = dict(Counter(word_tokenize(all_sentence)))
kata, jumlah = dict_words.keys(), dict_words.values()
dict_df = {'Kata': kata, 'Jumlah': jumlah}
positive_df = pd.DataFrame(dict_df)

# Label each adjective on its own with the pipeline currently bound to `nlp`
# (the BERT Base model, per the original note).
positive_df['Label'] = positive_df['Kata'].map(lambda word: nlp(word)[0]['label'])

In [None]:
# Word cloud of the adjectives from positive tweets that are themselves labelled 'Positive'.
positive_labeled = positive_df[positive_df['Label'] == 'Positive']
wordcloud(positive_labeled['Kata'], "Word Cloud Untuk Sentimen Positif")

In [None]:
# Word cloud of the adjectives from positive tweets that are labelled 'Negative'.
pos_neg_labeled = positive_df[positive_df['Label'] == 'Negative']
wordcloud(pos_neg_labeled['Kata'], "Word Cloud Untuk Kata-Kata Negatif yang Muncul pada Sentimen Positif")