<a href="https://colab.research.google.com/github/ulubeykhuja/Amaliyot/blob/main/Cinemania.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# 1. Ma’lumotlarni tayyorlash va tahlil qilish

import pandas as pd

# Ma’lumotlar to‘plamini yuklash
# wget yoki pandas yordamida IMDB film sharhlari ma’lumotlar to‘plamini
# (IMDB Dataset.csv) yuklab oling va uni DataFrame'ga o‘qing.

url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
df = pd.read_csv(url)

# Faqat review (matn) va sentiment (tuyg‘u) ustunlarini qoldiring.
df = df.rename(columns={'tweet': 'review', 'label': 'sentiment'})

df['sentiment'] = df['sentiment'].map({1: 'positive', 0: 'negative'})

# har bir toifadan (positive va negative) 1000 tadan tasodifiy namunani ajratib oling (random_state=101 bilan).
pos = df[df['sentiment'] == 'positive'].sample(1000, random_state=101)
neg = df[df['sentiment'] == 'negative'].sample(1000, random_state=101)

# Ikkala namunani birlashtirib, yangi DataFrame yarating va uni aralashtiring (frac=1).
final_df = pd.concat([pos, neg]).sample(frac=1, random_state=101).reset_index(drop=True)

print(final_df.head())
print(f"Jami namunalar soni: {len(final_df)}")

      id sentiment                                             review
0  18965  negative  to our best tatay in the universe happy happy ...
1  10786  positive          plz sign  #japan  #bullying #tweet4taiji 
2   4458  negative  i hope you guys have a fantastic summer and ge...
3  21148  negative  after my massage, i found out that my cavs are...
4  13438  negative  new hoover &amp; i can't wait to use it ð n...
Jami namunalar soni: 2000


In [13]:
# 2. RNN modeli uchun ma’lumotlarni kodlash

from collections import Counter
import re

# barcha sharhlardagi so‘zlarni tokenize qiling.
def tokenize(text):
    text = re.sub(r'[^a-zA-Z ]', '', text.lower())  # ҳарф бўлмаган белгиларни олиб ташлаймиз
    return text.split()

# collections.Counter yordamida so‘zlar chastotasini hisoblang.
all_words = []
for review in final_df['review']:
    all_words.extend(tokenize(review))

# Eng ko‘p uchraydigan 4000 ta so‘zdan iborat lug‘at (vocab) yarating.
word_counter = Counter(all_words)
most_common_words = word_counter.most_common(4000)

# 4. Lug‘atga <pad> (indeks 0) va <unk> (indeks 1) maxsus tokenlarini qo‘shing.
vocab = {word: i+2 for i, (word, _) in enumerate(most_common_words)}
vocab['<pad>'] = 0
vocab['<unk>'] = 1

print(f"Lug‘at hajmi: {len(vocab)} ta so‘z mavjud.")

# Matnni sonli ketma-ketlikka o‘girish
def encode_review(text):
    tokens = tokenize(text)
    encoded = [vocab.get(w, 1) for w in tokens]  # агар сўз лўғатда бўлмаса, <unk> (1) қўйилади
    return encoded

# Natijalarni DataFrame'ga yangi ustun sifatida qo‘shing.
final_df['encoded'] = final_df['review'].apply(encode_review)

# Maxi=Len & Padding
MAX_LEN = 200

def pad_sequence(seq, max_len=MAX_LEN):
    if len(seq) < max_len:
        return seq + [0]*(max_len - len(seq))  # <pad> билан тўлдириш
    else:
        return seq[:max_len]

final_df['padded'] = final_df['encoded'].apply(pad_sequence)

print(final_df[['review', 'sentiment', 'padded']].head())

Lug‘at hajmi: 4002 ta so‘z mavjud.
                                              review sentiment  \
0  to our best tatay in the universe happy happy ...  negative   
1          plz sign  #japan  #bullying #tweet4taiji   positive   
2  i hope you guys have a fantastic summer and ge...  negative   
3  after my massage, i found out that my cavs are...  negative   
4  new hoover &amp; i can't wait to use it ð n...  negative   

                                              padded  
0  [4, 68, 104, 2285, 7, 3, 1334, 42, 42, 111, 22...  
1  [619, 466, 309, 952, 1335, 0, 0, 0, 0, 0, 0, 0...  
2  [11, 163, 6, 333, 26, 5, 742, 136, 9, 61, 54, ...  
3  [164, 17, 744, 11, 334, 54, 21, 17, 1336, 13, ...  
4  [40, 2292, 19, 11, 69, 184, 4, 335, 18, 2293, ...  


In [16]:
# 3. SentimentRNN modelini qurish va o‘qitish

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# Model arxitekturasini yaratish
class IMDBDataset(Dataset):
    def __init__(self, df):
        self.data = df
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        x = torch.tensor(self.data.iloc[idx]['padded'], dtype=torch.long)
        y = torch.tensor(1 if self.data.iloc[idx]['sentiment'] == 'positive' else 0, dtype=torch.float32)
        return x, y

# Train/Test
train_size = int(0.8 * len(final_df))
train_df = final_df[:train_size]
test_df = final_df[train_size:]

train_loader = DataLoader(IMDBDataset(train_df), batch_size=32, shuffle=True)
test_loader = DataLoader(IMDBDataset(test_df), batch_size=32)

# SentimentRNN nomli Pytorch nn.Module klassini yarating.
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embeds = self.embedding(x)
        out, hidden = self.rnn(embeds)
        last_hidden = hidden[-1]
        out = self.fc(last_hidden)
        return self.sigmoid(out)

# Model, Loss, Optimizer
vocab_size = len(vocab)
model_rnn = SentimentRNN(vocab_size)
criterion = nn.BCELoss()
optimizer = Adam(model_rnn.parameters(), lr=0.001)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_rnn.to(device)

# Modelni o‘qitamiz
for epoch in range(10):
    model_rnn.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model_rnn(xb).squeeze()
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Yo‘qotish (Loss) = {total_loss / len(train_loader):.4f}")

Epoch 1: Yo‘qotish (Loss) = 0.6945
Epoch 2: Yo‘qotish (Loss) = 0.6938
Epoch 3: Yo‘qotish (Loss) = 0.6942
Epoch 4: Yo‘qotish (Loss) = 0.6938
Epoch 5: Yo‘qotish (Loss) = 0.6934
Epoch 6: Yo‘qotish (Loss) = 0.6933
Epoch 7: Yo‘qotish (Loss) = 0.6933
Epoch 8: Yo‘qotish (Loss) = 0.6933
Epoch 9: Yo‘qotish (Loss) = 0.6933
Epoch 10: Yo‘qotish (Loss) = 0.6934


In [17]:
# 4. Transformer modeli uchun ma’lumotlarni tayyorlash

from datasets import Dataset as HFDataset
from transformers import AutoTokenizer

# Hugging Face Dataset
dataset = HFDataset.from_pandas(final_df[['review', 'sentiment']])

# Train/Test (2500 / 500)
dataset_split = dataset.train_test_split(test_size=0.2, seed=123)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenizatsiya
def preprocess_function(batch):
    return tokenizer(
        batch['review'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

tokenized_datasets = dataset_split.map(preprocess_function, batched=True)

tokenized_datasets = tokenized_datasets.rename_column('sentiment', 'labels')

tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

print(tokenized_datasets)
print(tokenized_datasets['train'][0])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['review', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
})
{'labels': 'positive', 'input_ids': tensor([  101,  1030,  5310,  1030,  5310,  3791,  2000,  2543,  1030,  5310,
         1004, 23713,  1025,  1996, 12873,  1049,  7512,  2008,  5086,  2010,
         1001, 14052,  7694,  4632,   999,  2123,  1005,  1056, 10887,  2037,
        10845,   999,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

In [20]:
# 5. Transformer modelini o‘qitish va baholash
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# 'sentiment' ustunini 0/1 ko‘rinishiga o‘tkazamiz
final_df['labels'] = final_df['sentiment'].map({'negative': 0, 'positive': 1}).astype(int)

# Dataset yaratamiz
from datasets import Dataset as HFDataset
dataset = HFDataset.from_pandas(final_df[['review', 'labels']])
dataset_split = dataset.train_test_split(test_size=0.2, seed=123)

# Tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(batch):
    return tokenizer(batch['review'], truncation=True, padding='max_length', max_length=256)

tokenized_datasets = dataset_split.map(preprocess_function, batched=True)
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Model
model_bert = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# TrainingArguments (moslashtirilgan)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # epoch=3
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100, # steps = 100
    save_strategy='no',
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

# Modelni o‘qitamiz
trainer.train()

# Baholash
eval_results = trainer.evaluate()
print(f"Baholash yo‘qotish (eval_loss): {eval_results['eval_loss']:.4f}")

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.4417
200,0.3779
300,0.1899
400,0.1864
500,0.0755
600,0.0538


Baholash yo‘qotish (eval_loss): 0.6844


In [28]:
# 6. Ikkala modelni taqqoslash va xulosa

import torch
import re
from transformers import AutoTokenizer

# RNN modeli uchun
def predict_rnn(text):
    model_rnn.eval()
    with torch.no_grad():
        tokens = [vocab.get(w, 1) for w in re.sub(r'[^a-zA-Z ]', '', text.lower()).split()]
        padded = tokens[:200] + [0]*(200 - len(tokens)) if len(tokens) < 200 else tokens[:200]
        x = torch.tensor(padded, dtype=torch.long).unsqueeze(0).to(device)
        pred = model_rnn(x).item()
        return "positive" if pred >= 0.5 else "negative"

# Transformer modelli uchun
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def predict_transformer(text):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_bert.to(device)
    model_bert.eval()

    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = model_bert(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()

    return "positive" if pred == 1 else "negative"

# TeST
new_reviews = [
    "Ajoyib film ekan",
    "Syujet juda qiziqarli ekan.",
    "Film qiziq emas"
]

print("RNN модели натижалари:")
for text in new_reviews:
    print(f"{text} → {predict_rnn(text)}")

print("\nTransformer модели натижалари:")
for text in new_reviews:
    print(f"{text} → {predict_transformer(text)}")

RNN модели натижалари:
Ajoyib film ekan → positive
Syujet juda qiziqarli ekan. → positive
Film qiziq emas → positive

Transformer модели натижалари:
Ajoyib film ekan → negative
Syujet juda qiziqarli ekan. → negative
Film qiziq emas → negative


In [30]:
# Taqqoslash va xulosa

# Loss
print("RNN modeli o'qitish ko'rsatkichlari:")
print("Oxirgi epoch yo‘qotish (loss) ≈ 0.6933 — model deyarli random taxmin qiladi.")

print("\nTransformer baholash natijalari:")
print(f"Eval loss = {eval_results['eval_loss']:.4f} — model nisbatan yaxshiroq o‘rgangan.")

# Test natijalari
print("\nYangi sharhlar bo‘yicha yakuniy bashoratlar:")

for text in new_reviews:
    print(f"\nSharh: {text}")
    print(f"RNN → {predict_rnn(text)}")
    print(f"Transformer → {predict_transformer(text)}")

# Xulosa
print("""
XULOSA:
RNN modeli sodda va tez o‘qitiladi, ammo u kontekstni chuqur o‘rgana olmaydi — natijada taxminlar ko‘pincha noaniq chiqadi.
Transformer (DistilBERT) esa murakkabroq, lekin matnning semantik ma’nosini yaxshi tahlil qiladi.
Cinemania sentiment tahlili uchun eng maqbul yechim — Transformer modeli hisoblanadi.
""")

RNN modeli o'qitish ko'rsatkichlari:
Oxirgi epoch yo‘qotish (loss) ≈ 0.6933 — model deyarli random taxmin qiladi.

Transformer baholash natijalari:
Eval loss = 0.6844 — model nisbatan yaxshiroq o‘rgangan.

Yangi sharhlar bo‘yicha yakuniy bashoratlar:

Sharh: Ajoyib film ekan
RNN → positive
Transformer → negative

Sharh: Syujet juda qiziqarli ekan.
RNN → positive
Transformer → negative

Sharh: Film qiziq emas
RNN → positive
Transformer → negative

XULOSA:
RNN modeli sodda va tez o‘qitiladi, ammo u kontekstni chuqur o‘rgana olmaydi — natijada taxminlar ko‘pincha noaniq chiqadi.
Transformer (DistilBERT) esa murakkabroq, lekin matnning semantik ma’nosini yaxshi tahlil qiladi.
Cinemania sentiment tahlili uchun eng maqbul yechim — Transformer modeli hisoblanadi.

