<a href="https://colab.research.google.com/github/sumeyyedemir5/nlp-preprocessing_and_textRepresentation/blob/main/Text_preprocessing_and_Text_Representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Fetch the WordNet corpus; the WordNet-based lemmatizer used further down needs it.
nltk.download("wordnet")

In [None]:
from nltk.stem import PorterStemmer

# Stemming: strips affixes to reduce a word to its stem (e.g. "running" -> "run").
stemmer = PorterStemmer()
words = ["running", "runner", "runs", "go", "went"]

# Apply the stemmer to every sample word, preserving order.
stems = list(map(stemmer.stem, words))
stems

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Sample words
words = ["running", "runner", "ran", "runs", "better", "go", "went"]

# Lemmatization: maps a word to its dictionary base form (lemma), which is
# more meaningful than a stem (e.g. "went" -> "go").
# pos="v" tells the lemmatizer to look every word up as a verb.
lemmas = list(map(lambda w: lemmatizer.lemmatize(w, pos="v"), words))

print("Lemma result: ", lemmas)


In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

# Stop words: very common words such as "this, is, an, the" that carry little
# meaning on their own and are removed during preprocessing.
# Stored as a set so membership checks are fast.
stop_words_eng = {*stopwords.words("english")}

In [None]:
text = "this is an example of removing stop words from a text document"

# Keep only the whitespace-separated tokens whose lowercase form is not an
# English stop word.
filtered_text = list(
    filter(lambda token: token.lower() not in stop_words_eng, text.split())
)
filtered_text

In [None]:
text = "Hello World, 2025"

nltk.download("punkt_tab")

# Tokenization: splitting a text into sentences or into words.
sentence_tokens = nltk.sent_tokenize(text)
word_tokens = nltk.word_tokenize(text)

# Only the word-level tokens are displayed as the cell output.
word_tokens

# Metin Temsili (Text Representation)
Metinleri sayılara dönüştürme yöntemleridir.
1.   **BoW (Bag of Words)**

* Kelimelerin cümle içindeki sırasını önemsemeden sadece frekansına (kaç kere geçtiğine) bakar.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

documents = ["Kedi evde", "Kedi bahçede"]

# Learn the vocabulary from the two documents, then encode them as a
# document-term count matrix.
vectorizer = CountVectorizer()
x = vectorizer.fit(documents).transform(documents)

print("kelime kümesi: ", vectorizer.get_feature_names_out())

In [None]:
# Show the count matrix as a dense array: one row per document, one column per vocabulary word.
print("vektör kümesi:\n",x.toarray())

**1. satır ("Kedi evde") → [0 1 1]**

"bahçede" kelimesi: 0 kez

"evde" kelimesi: 1 kez

"kedi" kelimesi: 1 kez

**2. satır ("Kedi bahçede") → [1 0 1]**

"bahçede": 1 kez

"evde": 0 kez

"kedi": 1 kez

In [None]:
import re
from collections import Counter

import pandas as pd

# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook — confirm.
# Only the first 50 rows are kept for the rest of the analysis.
df = pd.read_csv(
    "/content/drive/MyDrive/IMDB Dataset.csv",
    encoding="utf-8",
).head(50)
df

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words_eng = set(stopwords.words("english"))

documents = df["review"]
labels = df["sentiment"]  # positive or negative


def clean_text(text):
    """Normalize one review into a space-separated string of content words.

    Steps, in order: lowercase the text, remove digit runs, remove
    punctuation, then drop stop words and any word of two characters or
    fewer.

    Apostrophes are kept until after the stop-word check. NLTK's English
    stop-word list stores contractions with their apostrophe (e.g. "don't"),
    so stripping apostrophes first would turn them into tokens such as
    "dont" that no longer match the list and would leak into the output.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned text as a single string with words joined by one space.
    """
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # remove numbers
    # Remove special characters, but keep apostrophes for the stop-word check.
    text = re.sub(r"[^\w\s']", "", text)

    kept_words = []
    for token in text.split():
        # Check both the raw token and the token without surrounding quotes,
        # so that a quoted stop word like 'the' is still recognised.
        if token in stop_words_eng or token.strip("'") in stop_words_eng:
            continue
        word = token.replace("'", "")
        # Drop short words (two characters or fewer).
        if len(word) > 2:
            kept_words.append(word)
    return " ".join(kept_words)


cleaned_doc = [clean_text(doc) for doc in documents]

In [None]:
# Preview only the first few cleaned reviews instead of dumping the whole list.
cleaned_doc[:5]

In [None]:
# Bag-of-words encoding of the cleaned reviews.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_doc)
feature_names = vectorizer.get_feature_names_out()

# Slice the sparse matrix first so that only the two requested rows are
# converted to a dense array, rather than densifying every row and then
# discarding all but two.
vektor2 = X[:2].toarray()
vektor2

In [None]:
# Vector representation: one row per document in X, one column per vocabulary word.
df_bow = pd.DataFrame(data=X.toarray(), columns=feature_names)

# Word frequency: column-wise sums of the count matrix, flattened to 1-D.
word_counts = X.sum(axis=0).A1
word_freq = {word: count for word, count in zip(feature_names, word_counts)}

# The five most frequently occurring words.
most_common_words = Counter(word_freq).most_common(5)
most_common_words

**2.   TF-IDF (Term Frequency - Inverse Document Frequency)**

* **TF**: Kelimenin bir belgede ne kadar sık geçtiğini ölçer.
* **IDF**: Kelimenin tüm belgelerdeki yaygınlığını ölçer. Çok fazla belgede bulunan kelimeler çok bilgi sağlamaz.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

doc = [
    "kedi çok tatlı bir hayvandır",
    "kedi ve köpekler çok tatlı hayvanlardır",
]

# Fit TF-IDF weights on the two sample sentences and encode them.
tfidfvector = TfidfVectorizer()
X = tfidfvector.fit_transform(doc)
feature_names = tfidfvector.get_feature_names_out()

# Tabular view: one row per sentence, one column per vocabulary word.
df_tfidf = pd.DataFrame(data=X.toarray(), columns=feature_names)
df_tfidf

Kelimenin metin içindeki önemini gösteren TF-IDF değerlerinin ortalaması

In [None]:
# TF-IDF weights of the word "kedi", one value per document.
kedi_tfidf = df_tfidf.loc[:, "kedi"]

# Average weight of "kedi" across the documents.
kedi_mean_tfidf = kedi_tfidf.mean()
kedi_mean_tfidf

In [None]:
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook — confirm.
df2 = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/sms_spam.csv")
df2

In [None]:
# TF-IDF encoding of the "text" column of the SMS data frame.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df2["text"])
X

In [None]:
# Call the method: without the parentheses `feature_names` would be the bound
# method object itself, and the "word" column would repeat that object on
# every row instead of listing the vocabulary.
feature_names = vectorizer.get_feature_names_out()

# Mean TF-IDF value of each word across all documents, flattened to 1-D.
tfidf_score = X.mean(axis=0).A1

# One row per vocabulary word with its average TF-IDF score.
df_tfidf = pd.DataFrame({"word": feature_names, "score": tfidf_score})
df_tfidf

**N-GRAM MODELING**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "bu bir örnek metindir",
    "bu örnek metin doğal dil işlemeyi gösterir",
]

# One vectorizer per n-gram size: ngram_range=(n, n) limits the features to
# sequences of exactly n consecutive words.
vectorizer_unigram, vectorizer_bigram, vectorizer_trigram = (
    CountVectorizer(ngram_range=(n, n)) for n in (1, 2, 3)
)

# Single-word (unigram) features.
X_unigram = vectorizer_unigram.fit(documents).transform(documents)
unigram_features = vectorizer_unigram.get_feature_names_out()
unigram_features

In [None]:
# Two-word (bigram) features extracted from the same documents.
X_bigram = vectorizer_bigram.fit(documents).transform(documents)
bigram_features = vectorizer_bigram.get_feature_names_out()
bigram_features

In [None]:
# Three-word (trigram) features extracted from the same documents.
X_trigram = vectorizer_trigram.fit(documents).transform(documents)
trigram_features = vectorizer_trigram.get_feature_names_out()
trigram_features