In [None]:
import os
import pandas as pd
import numpy as np
from keras.preprocessing.image import load_img, img_to_array
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset with labels
dataset_df = pd.read_csv('/content/dataset_with_labels.csv')
llama_mapping = {
    'А': 1, 'Б': 2, 'В': 3, 'Г': 4, 'Д': 5, 'Е': 6, 'Ё': 7, 'Ж': 8, 'З': 9, 'И': 10,
    'Й': 11, 'К': 12, 'Л': 13, 'М': 14, 'Н': 15, 'О': 16, 'П': 17, 'Р': 18, 'С': 19,
    'Т': 20, 'У': 21, 'Ф': 22, 'Х': 23, 'Ц': 24, 'Ч': 25, 'Ш': 26, 'Щ': 27, 'Ъ': 28,
    'Ы': 29, 'Ь': 30, 'Э': 31, 'Ю': 32, 'Я': 33,
}
# Add more mappings as needed

# Apply LLama encoding to labels
dataset_df['Encoded_Labels'] = dataset_df['label'].map(llama_mapping)
dataset_df = dataset_df.iloc[:-1, :]
# Split data into features (images) and labels
X = np.array(dataset_df)  # Assuming the first two columns are indices and 'label'
y = dataset_df['Encoded_Labels'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
not_nan_indices_train = ~np.isnan(y_train)
y_train = y_train[not_nan_indices_train]
X_train = X_train[not_nan_indices_train]

# Remove corresponding rows from y_test and X_test
not_nan_indices_test = ~np.isnan(y_test)
y_test = y_test[not_nan_indices_test]
X_test = X_test[not_nan_indices_test]
print(X_test.shape)
# Reshape images for CNN input (assuming images are grayscale with dimensions 28x28)
X_train = X_train.reshape(X_train.shape[0], 28, 28, 1)  # Reshape for Conv2D input
X_test = X_test.reshape(X_test.shape[0], 28, 28, 1)
unique_values_train = np.unique(y_train)
unique_values_test = np.unique(y_test)
print("Unique values in y_train:", unique_values_train)
print("Unique values in y_test:", unique_values_test)

# Convert labels to categorical format
y_train = to_categorical(y_train, num_classes=34)
y_test = to_categorical(y_test, num_classes=34)

# Build CNN model
model = Sequential()
model.add(Conv2D(35, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(llama_mapping) + 1, activation='softmax'))  # Output layer with LLama mappings count

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=35, validation_data=(X_test, y_test))
y_pred = model.predict(X_test)
predicted_labels = np.argmax(y_pred, axis=1)
true_labels = np.argmax(y_test, axis=1)
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")




(605, 786)


ValueError: cannot reshape array of size 1906836 into shape (2426,28,28,1)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the dataset with labels
dataset_df = pd.read_csv('/content/dataset_with_labels.csv')
llama_mapping = {
    'А': 1, 'Б': 2, 'В': 3, 'Г': 4, 'Д': 5, 'Е': 6, 'Ё': 7, 'Ж': 8, 'З': 9, 'И': 10,
    'Й': 11, 'К': 12, 'Л': 13, 'М': 14, 'Н': 15, 'О': 16, 'П': 17, 'Р': 18, 'С': 19,
    'Т': 20, 'У': 21, 'Ф': 22, 'Х': 23, 'Ц': 24, 'Ч': 25, 'Ш': 26, 'Щ': 27, 'Ъ': 28,
    'Ы': 29, 'Ь': 30, 'Э': 31, 'Ю': 32, 'Я': 33,
}

# Apply LLama encoding to labels
dataset_df['label'] = dataset_df['label'].map(llama_mapping)
y = dataset_df['label'].values
dataset_df.drop(columns=['label'], inplace=True)
# Split data into features (images) and labels
X = np.array(dataset_df)  # Assuming the first two columns are indices and 'label'


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=22)
not_nan_indices_train = ~np.isnan(y_train)
y_train = y_train[not_nan_indices_train]
X_train = X_train[not_nan_indices_train]

# Remove corresponding rows from y_test and X_test
not_nan_indices_test = ~np.isnan(y_test)
y_test = y_test[not_nan_indices_test]
X_test = X_test[not_nan_indices_test]

# Create and train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=22)
clf.fit(X_train, y_train)
print(X_train.shape)
# Make predictions
y_pred = clf.predict(X_test)
reverse_mapping = {v: k for k, v in llama_mapping.items()}
y_test_original = np.array([reverse_mapping[label] for label in y_test])
y_pred_original = np.array([reverse_mapping[label] for label in y_pred])
results = pd.DataFrame({'Actual': y_test_original, 'Predicted': y_pred_original})
print(results)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


(1520, 784)
     Actual Predicted
0         Ъ         Ь
1         П         Л
2         С         С
3         С         С
4         Щ         Ш
...     ...       ...
1507      М         М
1508      Н         Ч
1509      Ю         Й
1510      Я         Н
1511      Ы         Ы

[1512 rows x 2 columns]
Test Accuracy: 49.40%


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the dataset including labels in the last row
dataset_df = pd.read_csv('/content/dataset_with_labels.csv')

# Extract the labels from the last row
llama_labels = dataset_df.iloc[-1, :].values  # Assuming labels are in the last row and all other rows are data

# Remove the last row from the dataset
dataset_df = dataset_df.iloc[:-1, :]

# Apply LLama encoding to labels
llama_mapping = {
    'А': 1, 'Б': 2, 'В': 3, 'Г': 4, 'Д': 5, 'Е': 6, 'Ё': 7, 'Ж': 8, 'З': 9, 'И': 10,
    'Й': 11, 'К': 12, 'Л': 13, 'М': 14, 'Н': 15, 'О': 16, 'П': 17, 'Р': 18, 'С': 19,
    'Т': 20, 'У': 21, 'Ф': 22, 'Х': 23, 'Ц': 24, 'Ч': 25, 'Ш': 26, 'Щ': 27, 'Ъ': 28,
    'Ы': 29, 'Ь': 30, 'Э': 31, 'Ю': 32, 'Я': 33,
}
dataset_df['Encoded_Labels'] = dataset_df['label'].map(llama_mapping)

# Split data into features (images) and labels
X = np.array(dataset_df.iloc[:, :1])  # Assuming the first column is the data
y = dataset_df['Encoded_Labels'].values

# Check if X and y have the same number of samples
if len(X) != len(y):
    raise ValueError("Number of samples in X and y are not equal.")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=22,)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Now you have trained and evaluated your model with labels split off from the last row of the CSV file.


ValueError: Input y contains NaN.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Load the test dataset
test_df = pd.read_csv('/content/test_dataset.csv')

# Assuming the data in 'test.csv' is similar in structure to the training data
# Apply LLama encoding to the test data

X_test = test_df.to_numpy() # Assuming the first two columns are indices and 'label'
not_nan_indices_test = ~np.isnan(X_test)
X_test = X_test[not_nan_indices_test]
print(X_test.shape)
X_test=X_test.reshape(12399,784)
y_pred = clf.predict(X_test)

# Convert the predicted labels back to Cyrillic characters
predicted_labels = {v: k for k, v in llama_mapping.items()}
predicted_cyrillic_labels = [predicted_labels[y] for y in y_pred]

# Add the predicted Cyrillic labels to the test dataframe
test_df['Predicted_Labels'] = predicted_cyrillic_labels

test = pd.read_csv(r"/content/test.csv")
# Merge the 'predicted_labels' column into the test dataframe
test['label'] =test_df['Predicted_Labels']
test.to_csv('merged.csv', index=False)


(9720816,)


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.utils import to_categorical

# Load the dataset CSV file
dataset_df = pd.read_csv('/content/dataset_with_labels.csv')  # Update the file path

# Encode the labels using LabelEncoder
encoder = LabelEncoder()
dataset_df['Encoded_Labels'] = encoder.fit_transform(dataset_df['label'])

# Split data into features (images) and labels
X = np.array(dataset_df.iloc[:, 1:])  # Assuming the first column is the label
y = to_categorical(dataset_df['Encoded_Labels'])  # One-hot encode the labels

# Reshape the features for Conv2D input
X = X.reshape(X.shape[0], 28, 28, 1)  # Assuming images are grayscale with dimensions 28x28

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(encoder.classes_), activation='softmax'))  # Output layer with number of classes

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


ValueError: cannot reshape array of size 2418585 into shape (3081,28,28,1)