In [1]:
# utilities
import re
import pickle
import numpy as np
import pandas as pd

# important libraries
from bokeh.plotting import figure
from bokeh.io import output_file, show, output_notebook
from collections import Counter
import spacy
from spacy.util import compounding
from spacy.util import minibatch
from spacy import displacy
import gc
import os
from scipy.stats import skew, kurtosis

# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Build one sampled DataFrame (`df`) from every CSV file found under `user_path`.
# Each file is read in full, a fixed fraction of its rows is drawn at random,
# and the per-file samples are stacked into a single frame with a fresh index.

# Set the folder path
user_path = r"E:\Data_Mining_project\Data mining data"  # root folder that is searched recursively for CSV files

# Define what fraction of each file to sample
sample_fraction = 0.01  # 1% sample of every file
random_state = 42       # for reproducibility

# Gather all `.csv` / `.csv.gz` paths anywhere below user_path
csv_collection = []
for dirname, _, filenames in os.walk(user_path):
    for filename in filenames:
        if filename.endswith(('.csv', '.csv.gz')):
            csv_collection.append(os.path.join(dirname, filename))

# os.walk does not guarantee a listing order; sorting keeps the row order of the
# concatenated sample identical from run to run, matching the fixed random_state.
csv_collection.sort()

print(f" Total CSV files found: {len(csv_collection)}")

# Load and sample from each file
sampled_chunks = []

for file in csv_collection:
    try:
        # index_col=0: the first column of each file is used as the row index
        # (it is discarded later by ignore_index=True in the concat).
        df_chunk = pd.read_csv(file, index_col=0, low_memory=False)
        sample = df_chunk.sample(frac=sample_fraction, random_state=random_state)
        sampled_chunks.append(sample)
        print(f" Sampled {len(sample)} rows from: {file}")
    except Exception as e:
        # Best effort: report the file that could not be loaded and carry on.
        print(f" Error loading {file}: {e}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# stop here with a message that points at the likely cause instead.
if not sampled_chunks:
    raise ValueError(
        f"No CSV data could be sampled from {user_path!r}; "
        "check the folder path and any load errors reported above."
    )

# Concatenate all sampled parts
df = pd.concat(sampled_chunks, axis=0, ignore_index=True)
print(f"\n Final sampled DataFrame shape: {df.shape}")

 Total CSV files found: 461
 Sampled 3649 rows from: E:\Data_Mining_project\Data mining data\0401_UkraineCombinedTweetsDeduped.csv
 Sampled 3710 rows from: E:\Data_Mining_project\Data mining data\0402_UkraineCombinedTweetsDeduped.csv
 Sampled 4455 rows from: E:\Data_Mining_project\Data mining data\0403_UkraineCombinedTweetsDeduped.csv
 Sampled 4304 rows from: E:\Data_Mining_project\Data mining data\0404_UkraineCombinedTweetsDeduped.csv
 Sampled 4526 rows from: E:\Data_Mining_project\Data mining data\0405_UkraineCombinedTweetsDeduped.csv
 Sampled 4228 rows from: E:\Data_Mining_project\Data mining data\0406_UkraineCombinedTweetsDeduped.csv
 Sampled 3787 rows from: E:\Data_Mining_project\Data mining data\0407_UkraineCombinedTweetsDeduped.csv
 Sampled 4055 rows from: E:\Data_Mining_project\Data mining data\0408_UkraineCombinedTweetsDeduped.csv
 Sampled 4030 rows from: E:\Data_Mining_project\Data mining data\0409_UkraineCombinedTweetsDeduped.csv
 Sampled 3731 rows from: E:\Data_Mining_proje

In [3]:
# Save the sampled DataFrame to CSV (row index omitted).
# The destination lives in one variable so the confirmation message always
# reports the file that was actually written.
output_path = "E:/Data_Mining_project/sampled_dataset_1%.csv"
df.to_csv(output_path, index=False)
print(f" Sampled dataset saved to: {output_path}")

 Sampled dataset saved to: E:/Data_Mining_project/sampled_dataset.csv


In [4]:
# Report the DataFrame's dimensions as a (rows, columns) tuple
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(708764, 28)
