In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API credentials file) into the working directory
files.upload()

# Move kaggle.json into ~/.kaggle and make it readable/writable by the owner only
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the Sentiment140 dataset archive from Kaggle and extract it
!kaggle datasets download -d kazanova/sentiment140
!unzip sentiment140.zip

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 96% 78.0M/80.9M [00:03<00:00, 31.3MB/s]
100% 80.9M/80.9M [00:03<00:00, 24.3MB/s]
Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Loading and preprocessing the data
def load_and_preprocess_data(file_path, frac=0.1, subsample_frac=0.3, random_state=42):
    """Load the Sentiment140 CSV, subsample it and add a cleaned text column.

    Args:
        file_path: Path to the header-less, latin1-encoded CSV whose six
            columns are target, id, date, flag, user and text.
        frac: Fraction of each sentiment class to draw in the first sampling step.
        subsample_frac: Fraction of the combined per-class samples that is kept
            (and shuffled) in the second sampling step.
        random_state: Seed used for every sampling step.

    Returns:
        DataFrame with the columns ``text``, ``sentiment`` (1 when the original
        target equals 4, otherwise 0) and ``cleaned_text`` (URLs and all
        non-letter, non-whitespace characters removed, lower-cased). The
        original row index is preserved.
    """
    print("Loading and preprocessing data...")
    column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
    # Only 'target' and 'text' are used below, so skip parsing the other columns.
    raw = pd.read_csv(
        file_path,
        encoding='latin1',
        header=None,
        names=column_names,
        usecols=['target', 'text'],
    )

    # Build a fresh frame instead of assigning into a slice
    # (avoids pandas' SettingWithCopyWarning).
    data = pd.DataFrame({
        'text': raw['text'],
        'sentiment': (raw['target'] == 4).astype(int),
    })

    print("Balancing data by sampling...")
    positive_tweets = data[data['sentiment'] == 1]
    negative_tweets = data[data['sentiment'] == 0]

    # Draw the same fraction (`frac`) from each class
    positive_sample = positive_tweets.sample(frac=frac, random_state=random_state)
    negative_sample = negative_tweets.sample(frac=frac, random_state=random_state)

    # Combine both classes, then shuffle and keep `subsample_frac` of the rows
    balanced_data = pd.concat([positive_sample, negative_sample]).sample(
        frac=subsample_frac, random_state=random_state
    )

    print("Cleaning text data...")
    # The dot after "www" is escaped: an unescaped "." matches any character,
    # which deleted ordinary text such as "awww so" along with real URLs.
    without_urls = balanced_data['text'].str.replace(r"http\S+|www\.\S+", "", regex=True)
    balanced_data['cleaned_text'] = (
        without_urls.str.replace(r"[^a-zA-Z\s]", "", regex=True).str.lower()
    )

    print(f"Data loaded and preprocessed. Total rows: {len(balanced_data)}")
    return balanced_data

# 2. Building the similarity graph with KNN + chunking
def build_similarity_graph_knn_chunking(data, vectorizer, chunk_size=500, n_neighbors=10, threshold=0.5):
    """Build an undirected cosine-similarity graph over the rows of ``data``.

    Each row becomes a node keyed by its positional index and carrying a
    ``sentiment`` attribute. A row is linked to each of its nearest neighbours
    whose cosine similarity is strictly greater than ``threshold``; the
    similarity is stored as the edge ``weight``.

    Args:
        data: DataFrame with ``cleaned_text`` and ``sentiment`` columns.
        vectorizer: Already-fitted vectorizer; its ``transform`` is applied to
            ``data['cleaned_text']``.
        chunk_size: Number of rows queried against the neighbour index at a time.
        n_neighbors: Neighbours requested per row (capped at the row count).
            The row itself is normally among them and is skipped.
        threshold: Minimum (exclusive) cosine similarity for an edge.

    Returns:
        networkx.Graph with one node per row and no self-loops.
    """
    print("Building similarity graph using KNN with chunking...")
    similarity_graph = nx.Graph()

    tfidf_matrix = vectorizer.transform(data['cleaned_text'])
    n_samples = tfidf_matrix.shape[0]

    if n_samples > 0:
        sentiments = data['sentiment'].to_numpy()

        # The index is fitted on the full matrix, so it does not depend on the
        # chunk: fit once instead of once per chunk. Cap k so kneighbors() does
        # not raise when fewer than n_neighbors rows exist.
        k = min(n_neighbors, n_samples)
        nn_model = NearestNeighbors(n_neighbors=k, metric='cosine').fit(tfidf_matrix)

        # Ceiling division; "n // chunk_size + 1" over-counts by one whenever
        # n_samples is an exact multiple of chunk_size.
        total_chunks = (n_samples + chunk_size - 1) // chunk_size

        for chunk_idx, chunk_start in enumerate(range(0, n_samples, chunk_size)):
            chunk_end = min(chunk_start + chunk_size, n_samples)
            distances, indices = nn_model.kneighbors(tfidf_matrix[chunk_start:chunk_end])

            for offset, (neighbor_ids, neighbor_dists) in enumerate(zip(indices, distances)):
                global_idx = chunk_start + offset
                similarity_graph.add_node(global_idx, sentiment=sentiments[global_idx])
                for neighbor_idx, distance in zip(neighbor_ids, neighbor_dists):
                    neighbor_idx = int(neighbor_idx)  # plain int node keys, not numpy scalars
                    if neighbor_idx == global_idx:
                        # A row is its own nearest neighbour (distance 0); linking it
                        # to itself would add a self-loop that inflates its degree.
                        continue
                    similarity = 1 - distance  # cosine distance -> similarity
                    if similarity > threshold:
                        similarity_graph.add_edge(global_idx, neighbor_idx, weight=similarity)

            print(f"Processed chunk {chunk_idx + 1}/{total_chunks}")

    print(f"Graph created with {similarity_graph.number_of_nodes()} nodes and {similarity_graph.number_of_edges()} edges.")
    return similarity_graph



# 3. Computing Malatya centrality
def calculate_malatya_centrality(graph):
    """Compute the Malatya centrality of every node in ``graph``.

    A node's score is the sum, over its neighbours, of the node's own degree
    divided by that neighbour's degree. Nodes without neighbours score 0.

    Args:
        graph: Graph object exposing ``nodes``, ``neighbors(node)`` and ``degree``.

    Returns:
        dict mapping each node to its score.
    """
    print("Calculating Malatya Centrality...")
    degree_of = graph.degree
    scores = {}
    for vertex in graph.nodes:
        own_degree = degree_of[vertex]
        # Summing over an empty neighbourhood yields 0, which covers isolated nodes.
        scores[vertex] = sum(own_degree / degree_of[other] for other in graph.neighbors(vertex))
    print("Malatya Centrality calculated.")
    return scores

def predict_sentiment(new_tweet, graph, vectorizer, tfidf_matrix, chunk_size=500, n_neighbors=10, threshold=0.5):
    """Predict the sentiment of ``new_tweet`` from its neighbours in ``graph``.

    The tweet is added to ``graph`` as a new node (the graph is modified in
    place and the node is kept afterwards). Within each chunk of
    ``tfidf_matrix`` the tweet is linked to its nearest rows whose cosine
    similarity is strictly greater than ``threshold``. Malatya centrality is
    then recomputed for the whole graph, and the centralities of the new
    node's positive and negative neighbours are summed and compared.

    Args:
        new_tweet: Raw tweet text.
        graph: Similarity graph whose node ids are row positions in ``tfidf_matrix``.
        vectorizer: Fitted vectorizer used to embed the tweet.
        tfidf_matrix: TF-IDF matrix of the rows the graph was built from.
        chunk_size: Number of rows per nearest-neighbour search.
        n_neighbors: Neighbours requested per chunk (capped at the chunk's row count).
        threshold: Minimum (exclusive) cosine similarity for an edge.

    Returns:
        "Positive" when the positive sum is strictly larger, otherwise
        "Negative" (this includes ties and the case of no neighbours at all).
    """
    import re  # local import: only needed for cleaning the incoming tweet

    print(f"Predicting sentiment for the tweet: {new_tweet}")

    # Apply the same cleaning the training text received (strip URLs, keep only
    # letters and whitespace, lower-case) so tokens match the fitted vocabulary.
    cleaned_tweet = re.sub(r"http\S+|www\.\S+", "", new_tweet)
    cleaned_tweet = re.sub(r"[^a-zA-Z\s]", "", cleaned_tweet).lower()
    new_tfidf = vectorizer.transform([cleaned_tweet])

    # Start from the node count, but step past any id that is already taken so
    # an existing node's attributes are never overwritten.
    new_node_id = len(graph.nodes)
    while new_node_id in graph:
        new_node_id += 1
    graph.add_node(new_node_id, text=new_tweet, sentiment=None)

    n_samples = tfidf_matrix.shape[0]
    for chunk_start in range(0, n_samples, chunk_size):
        chunk_end = min(chunk_start + chunk_size, n_samples)
        chunk_matrix = tfidf_matrix[chunk_start:chunk_end]

        # The last chunk may hold fewer rows than n_neighbors; kneighbors()
        # raises in that case, so cap k at the chunk's row count.
        k = min(n_neighbors, chunk_matrix.shape[0])
        nn_model = NearestNeighbors(n_neighbors=k, metric='cosine').fit(chunk_matrix)
        distances, indices = nn_model.kneighbors(new_tfidf)

        for local_idx, distance in zip(indices[0], distances[0]):
            similarity = 1 - distance  # cosine distance -> similarity
            if similarity > threshold:
                # Convert the chunk-local position into a plain-int global node id.
                graph.add_edge(new_node_id, chunk_start + int(local_idx), weight=similarity)

    print(f"New node {new_node_id} added to graph with {len(graph.edges(new_node_id))} edges.")

    # Recompute Malatya centrality on the graph that now includes the new node
    centrality = calculate_malatya_centrality(graph)

    positive_centrality = 0
    negative_centrality = 0
    for node in graph.neighbors(new_node_id):
        label = graph.nodes[node].get('sentiment')
        if label == 1:
            positive_centrality += centrality[node]
        elif label == 0:
            negative_centrality += centrality[node]

    print(f"Positive centrality for the new tweet: {positive_centrality}")
    print(f"Negative centrality for the new tweet: {negative_centrality}")

    result = "Positive" if positive_centrality > negative_centrality else "Negative"
    print(f"Predicted sentiment: {result}")
    return result



# 5. Main program
if __name__ == "__main__":
    # Load the dataset and subsample it
    data = load_and_preprocess_data("training.1600000.processed.noemoticon.csv", frac=0.3)

    # Fit the TF-IDF vectorizer on the cleaned text
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(data['cleaned_text'])

    # Build the similarity graph with KNN + chunking
    similarity_graph = build_similarity_graph_knn_chunking(data, vectorizer, chunk_size=1000, n_neighbors=10, threshold=0.5)

    # Compute Malatya centrality and store it on the nodes
    malatya_centrality = calculate_malatya_centrality(similarity_graph)
    nx.set_node_attributes(similarity_graph, malatya_centrality, 'malatya_centrality')

    # Read tweets from the user and predict their sentiment
    print("Enter your tweets below (type 'exit' to stop):")
    while True:
        # Strip surrounding whitespace so "exit " still exits.
        new_tweet = input("Enter a tweet: ").strip()
        if not new_tweet:
            # Blank input would otherwise be added to the graph as an empty node
            # and always be reported as "Negative"; ask again instead.
            continue
        if new_tweet.lower() == 'exit':
            print("Exiting...")
            break
        sentiment = predict_sentiment(new_tweet, similarity_graph, vectorizer, tfidf_matrix)
        print(f"The sentiment of the tweet is: {sentiment}")


Loading and preprocessing data...
Balancing data by sampling...
Cleaning text data...
Data loaded and preprocessed. Total rows: 144000
Building similarity graph using KNN with chunking...
Processed chunk 1/145
Processed chunk 2/145
Processed chunk 3/145
Processed chunk 4/145
Processed chunk 5/145
Processed chunk 6/145
Processed chunk 7/145
Processed chunk 8/145
Processed chunk 9/145
Processed chunk 10/145
Processed chunk 11/145
Processed chunk 12/145
Processed chunk 13/145
Processed chunk 14/145
Processed chunk 15/145
Processed chunk 16/145
Processed chunk 17/145
Processed chunk 18/145
Processed chunk 19/145
Processed chunk 20/145
Processed chunk 21/145
Processed chunk 22/145
Processed chunk 23/145
Processed chunk 24/145
Processed chunk 25/145
Processed chunk 26/145
Processed chunk 27/145
Processed chunk 28/145
Processed chunk 29/145
Processed chunk 30/145
Processed chunk 31/145
Processed chunk 32/145
Processed chunk 33/145
Processed chunk 34/145
Processed chunk 35/145
Processed chunk 