<a href="https://colab.research.google.com/github/sumeyyedemir5/nlp-preprocessing_and_textRepresentation/blob/main/Word_Embedding_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install gensim, which provides the Word2Vec model imported later in this notebook.
!pip install gensim

# **Maksimum Entropi Modelleri**


In [None]:
from nltk.classify import MaxentClassifier
# Toy training set: each example is (feature dict, label); a word is "present"
# in an example when its feature value is True.
train_data = [
    ({"love": True, "amazing": True}, "positive"),
    ({"hate": True, "terrible": True}, "negative"),
    ({"happy": True, "joy": True}, "positive"),
    ({"sad": True, "depressed": True}, "negative"),
]

classifier = MaxentClassifier.train(train_data, max_iter=10)
test_sentence = "I like this amazing movie"

# Derive the feature vocabulary from the training data instead of typing it by
# hand: the hand-written list omitted "amazing", so the only trained word in
# the test sentence was never turned into a feature.
vocabulary = sorted({name for featureset, _ in train_data for name in featureset})
test_tokens = set(test_sentence.lower().split())
features = {word: (word in test_tokens) for word in vocabulary}

label = classifier.classify(features)
label

# WORD EMBEDDINGS


**WORD2VEC Temel Modelleri**
*  CBOW

Bir kelimenin bağlamındaki diğer kelimeleri kullanarak o kelimeyi tahmin etmeyi hedefler.
*   Skip-gram model

CBOW'un tam tersidir. Bir kelimeyi kullanarak o kelimenin bağlamında yer alan kelimeleri tahmin etmeyi hedefler.



**Recurrent Neural Networks (RNN)**

*Her zaman adımında önceki zaman adımındaki bilgiyi saklayarak ve sonraki adımlarla bu bilgiyi güncelleyerek çalışırlar.*



In [None]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense , Embedding
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Toy sentiment dataset: each review is written next to its label so the two
# cannot drift out of alignment.
labeled_reviews = [
    ("I absolutely loved the movie, it was fantastic!", "positive"),
    ("The plot was boring and predictable.", "negative"),
    ("Great performances by the lead actors.", "positive"),
    ("I didn't enjoy the film at all.", "negative"),
    ("The cinematography was stunning and beautiful.", "positive"),
    ("Terrible script and poor direction.", "negative"),
    ("An emotional rollercoaster that kept me engaged.", "positive"),
    ("The movie was too long and dragged on.", "negative"),
    ("A masterpiece that exceeded my expectations.", "positive"),
    ("I regret watching this movie.", "negative"),
    ("The soundtrack was amazing and memorable.", "positive"),
    ("The acting was wooden and unconvincing.", "negative"),
    ("I laughed throughout, such a fun movie!", "positive"),
    ("I almost fell asleep, it was that dull.", "negative"),
    ("Brilliant storytelling and plot twists.", "positive"),
    ("Disappointing and underwhelming.", "negative"),
    ("The visuals were breathtaking and immersive.", "positive"),
    ("Horrible pacing and confusing plot.", "negative"),
    ("I was moved by the heartfelt scenes.", "positive"),
    ("It felt like a waste of time.", "negative"),
    ("Excellent character development.", "positive"),
    ("Poor editing and awkward transitions.", "negative"),
    ("The movie had a perfect balance of humor and drama.", "positive"),
    ("I couldn't relate to any of the characters.", "negative"),
    ("An inspiring and uplifting story.", "positive"),
    ("The dialogues were cringe-worthy.", "negative"),
    ("Highly entertaining and captivating from start to finish.", "positive"),
    ("It was cliché and uninspired.", "negative"),
    ("I would definitely watch it again.", "positive"),
    ("I wish I hadn't spent money on this film.", "negative"),
]

# Column-oriented view of the same rows: parallel "text" and "label" lists.
data = {
    "text": [review for review, _ in labeled_reviews],
    "label": [sentiment for _, sentiment in labeled_reviews],
}

df = pd.DataFrame(data)
# tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"]) # Cümleler sayı dizilerine dönüşür: "I love" -> [5, 12]
word_index = tokenizer.word_index # Kelime-Sayı eşleşmesini saklar

# Padding: Tüm cümleleri aynı uzunluğa getirir. RNN sabit boyutlu girdi ister.
maxlen = max(len(seq) for seq in sequences)
x= pad_sequences(sequences,maxlen=maxlen)

# label Encoding
label_encoder = LabelEncoder()
y=label_encoder.fit_transform(data["label"])

#train_test_split
X_train ,X_test, y_train, y_test = train_test_split(x ,y ,test_size= 0.3, random_state=42)

# Word embedding
embedding_dim = 100

# Train Word2Vec on the tokens the Keras Tokenizer actually produced (lower-cased,
# punctuation stripped). Splitting the raw text on whitespace kept tokens such as
# "movie," or "The", which never match the keys of `word_index`; those words then
# kept an all-zero row in the embedding matrix.
index_word = tokenizer.index_word
sentences = [[index_word[idx] for idx in seq] for seq in sequences]
word2vec_model = Word2Vec(sentences, vector_size=embedding_dim, window=5, min_count=1)

# One row per word id; row 0 is never assigned and stays all zeros.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
  # Guard kept for words Word2Vec did not learn a vector for; they stay zero.
  if word in word2vec_model.wv:
    embedding_matrix[i] = word2vec_model.wv[word]

# Build the RNN: frozen pre-trained embeddings -> SimpleRNN -> sigmoid output
model = Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,  # keep the Word2Vec vectors fixed during training
    ),
    SimpleRNN(100, return_sequences=False),
    Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The held-out split is passed as validation data so its metrics are reported
# after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=2,
    validation_data=(X_test, y_test),
)

print(" ")
loss, accuracy = model.evaluate(X_test, y_test)
print("Test loss: ", loss)
print("Test accuracy: ", accuracy)

In [None]:
# Mount Google Drive at /content/drive (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def classify_sentence(sentence):
  """Classify a single sentence with the trained RNN.

  The sentence is encoded with the module-level `tokenizer`, padded to
  `maxlen`, and passed to the module-level `model`.

  Returns "pozitif" when the model output is above 0.5, otherwise "negatif".
  """
  encoded = tokenizer.texts_to_sequences([sentence])
  batch = pad_sequences(encoded, maxlen=maxlen)

  # predict returns one row per input sentence; take the single output value.
  probability = model.predict(batch)[0][0]
  return "pozitif" if probability > 0.5 else "negatif"

sentence = "The dialogues were cringe-worthy."
result = classify_sentence(sentence)
print(result)

In [None]:
# -------- TEXT CLASSIFIER --------
import pandas as pd

# Read the SMS spam CSV from the Google Drive path (decoded as latin-1)
SPAM_CSV_PATH = "/content/drive/MyDrive/sms_spam.csv"
df_spam = pd.read_csv(SPAM_CSV_PATH, encoding="latin-1")
df_spam.head()

In [None]:
# Show the column names as they were read from the CSV.
df_spam.columns

In [None]:
# Replace the column names with "label" and "text".
df_spam.columns =["label","text"]

In [None]:
# Count missing values per column
df_spam.isna().sum()

In [None]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

text = list(df_spam['text'])

lemmatizer = WordNetLemmatizer()
# Build the stop-word set and the regex once. The stop-word set used to be
# rebuilt for every single word of every message.
stop_words = set(stopwords.words("english"))
non_letters = re.compile("[^a-zA-Z]")


def clean_message(message):
  """Return a cleaned, space-joined version of one message.

  Non-letter characters are replaced by spaces, the text is lower-cased and
  split on whitespace, English stop words are dropped, and the remaining
  words are lemmatized.
  """
  words = non_letters.sub(" ", message).lower().split()
  return " ".join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)


corpus = [clean_message(message) for message in text]

df_spam["text2"] = corpus


In [None]:
# Train/test split: the `text2` column is the input, `label` is the target
from sklearn.model_selection import train_test_split

X, y = df_spam["text2"], df_spam["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)


In [None]:
# Feature extraction: fit the vectorizer on the training texts only and turn
# them into a token-count matrix.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_train = cv.fit_transform(X_train)


In [None]:
# Classifier training
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so that re-running the notebook yields the same tree and the
# same scores; without it the fitted tree can differ between runs.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(x_train, y_train)


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Store the vectorized test set under its own name (matching `x_train`) instead
# of overwriting `X_test`: the overwrite replaced the raw texts with a matrix,
# so running this cell a second time failed.
x_test = cv.transform(X_test)
y_pred = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Accuracy of the predictions in y_pred against the true labels in y_test.
print(accuracy_score(y_test, y_pred))