### Import libraries

In [2]:
import pandas as pd
import json
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import download
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Download NLTK resources
download('punkt')
download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Converting data to json

In [3]:
# Step 1: Load and Parse `tweets.js`
def load_tweets(file_path):
    with open(file_path, encoding='utf-8') as file:
        content = file.read()
        json_data = content.split('=', 1)[1].strip().rstrip(';')
        tweets = json.loads(json_data)
    return tweets

# Step 2: Load and Parse `like.js`
def load_likes(file_path):
    with open(file_path, encoding='utf-8') as file:
        content = file.read()
        json_data = content.split('=', 1)[1].strip().rstrip(';')
        likes = json.loads(json_data)
    return likes

In [9]:
# Load datasets
tweets = load_tweets('tweets.js')
likes = load_likes('like.js')

# Flatten tweets
tweets_df = pd.json_normalize([item['tweet'] for item in tweets if 'tweet' in item])

# Flatten the 'like' key from the likes JSON
likes_df = pd.json_normalize([item['like'] for item in likes if 'like' in item])

# Rename the columns for consistency
likes_df.rename(columns={'tweetId': 'liked_tweet_id', 'fullText': 'liked_tweet_text'}, inplace=True)
tweets_df.rename(columns={'created_at': 'tweet_created_at', 'full_text': 'tweet_text', 'id_str': 'tweet_id'}, inplace=True)


In [10]:
# Step 3: Merge Tweets and Likes
combined_df = tweets_df.merge(
    likes_df, left_on='tweet_id', right_on='liked_tweet_id', how='left', suffixes=('', '_liked')
)

# Add a binary column indicating if the tweet was liked
combined_df['liked'] = combined_df['liked_tweet_id'].notna()


In [11]:
# Step 4: Clean Tweet Text
def clean_text(text):
    """
    Cleans the tweet text by removing URLs, mentions, hashtags, and special characters.
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)    # Remove mentions
    text = re.sub(r'#\S+', '', text)    # Remove hashtags
    text = re.sub(r'[^A-Za-zğüşöçıİĞÜŞÖÇ ]', '', text)  # Remove special characters
    return text.strip().lower()

combined_df['clean_text'] = combined_df['tweet_text'].apply(clean_text)

In [14]:
# Step 5: Tokenize and Remove Stopwords
stop_words = set(stopwords.words('turkish'))

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

combined_df['tokens'] = combined_df['clean_text'].apply(tokenize_and_remove_stopwords)

In [15]:
# Step 6: Add Time-Based Features
combined_df['tweet_created_at'] = pd.to_datetime(combined_df['tweet_created_at'])
combined_df['day_of_week'] = combined_df['tweet_created_at'].dt.day_name()
combined_df['hour'] = combined_df['tweet_created_at'].dt.hour

  combined_df['tweet_created_at'] = pd.to_datetime(combined_df['tweet_created_at'])


In [16]:
# Step 7: Handle Missing Data
combined_df = combined_df.dropna(subset=['clean_text', 'tweet_created_at'])
combined_df = combined_df[combined_df['clean_text'] != '']

In [17]:
# Step 8: Save Preprocessed Data
combined_df.to_csv('combined_preprocessed_data.csv', index=False, encoding='utf-8')
print("Combined preprocessed data saved to 'combined_preprocessed_data.csv'")

Combined preprocessed data saved to 'combined_preprocessed_data.csv'


In [18]:
print(combined_df.head())
print(combined_df.info())


   retweeted                                             source  \
0      False  <a href="http://twitter.com/download/iphone" r...   
1      False  <a href="http://twitter.com/download/iphone" r...   
2      False  <a href="http://twitter.com/download/iphone" r...   
3      False  <a href="http://twitter.com/download/iphone" r...   
4      False  <a href="http://twitter.com/download/iphone" r...   

  display_text_range favorite_count             tweet_id  truncated  \
0           [0, 140]              0  1871313593029083255      False   
1            [0, 74]              0  1870850626906923460      False   
2           [0, 139]              0  1870835688226410871      False   
3            [0, 40]              0  1869631807110717867      False   
4            [0, 90]              5  1868700830897189196      False   

  retweet_count                   id          tweet_created_at  favorited  \
0             0  1871313593029083255 2024-12-23 21:55:02+00:00      False   
1             0 